I have a code which does ocr or converts pdf to txt in python and uses a regex based approach to find out categories of document. I want my code to be exposed as an api. I am using flask for this task. I am getting 404 Not Found error while running the url.
My Document category extraction code goes like this: Name of file is dtd.py
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import re
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
def convert(fname, pages=None,encoding='utf-8'):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = open(fname, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
if len(text)>=500:
regex3=re.search(r"\d+(?:[.-]\w+)*\s*(General Information|Process validation|Manufacturer(s)Reference Standards or Materials|Container Closure Systems|Stability Summary and Conclusions|Post Approval Stability Protocol and Stability Commitment)",text,re.IGNORECASE)
return regex3
else:
pdffile = wi(filename = fname, resolution = 300)
pdfImg = pdffile.convert('jpeg')
imgBlobs = []
for img in pdfImg.sequence:
page = wi(image = img)
imgBlobs.append(page.make_blob('jpeg'))
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# extracted_text = []
for imgBlob in imgBlobs:
im= Image.open(io.BytesIO(imgBlob))
text2 = pytesseract.image_to_string(im, lang = 'eng')
regex3=re.search(r"\d+(?:[.-]\w+)*\s*(General Information|Manufacturer(s)|Process Validation|Batch Formula|Description of Manufacturing Process and Process Controls|Container Closure Systems|Stability Summary and Conclusions|Post Approval Stability Protocol and Stability Commitment)",text2,re.IGNORECASE)
return regex3
convert(r'D:\files\00ac4250-d746-4c8a-b3-2798b0c2d4f9.pdf')
My flask api code is like this: It's called app.py
import dtd
from dtd import convert
from flask import Flask, request
from flask_restful import Resource, Api
#from flask.views import MethodView
app = Flask(__name__)
api = Api(app)
#convert(r'D:\files\67cecf40-71cf-4fc4-82e1-696ca41a9fba.pdf')
class dtdtext(Resource):
def get(self, result):
return {'data': dtd.convert(result)}
#api.add_resource(dtdtext, '/dtd/<result>')
categories=convert(r'D:\files\67cecf40-71cf-4fc4-82e1-696ca41a9fba.pdf')
@app.route('/dtd')
def returnResult():
return {'data': categories}
if __name__ == '__main__':
app.run()
The dtd.py will return the name of a category example 'Manufacturers' and I want to display it as a rest api. How to effectively do this
500 Internal Error in stacktrace:
* Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [02/Jul/2019 17:58:42] "GET / HTTP/1.1" 404 -
[2019-07-02 17:58:47,184] ERROR in app: Exception on /dtd [GET]
Traceback (most recent call last):
File "C:\Users\biltu\Anaconda3\lib\site-packages\flask\app.py", line 1982, in wsgi_app
response = self.full_dispatch_request()
File "C:\Users\biltu\Anaconda3\lib\site-packages\flask\app.py", line 1615, in full_dispatch_request
return self.finalize_request(rv)
File "C:\Users\biltu\Anaconda3\lib\site-packages\flask\app.py", line 1630, in finalize_request
response = self.make_response(rv)
File "C:\Users\biltu\Anaconda3\lib\site-packages\flask\app.py", line 1740, in make_response
rv = self.response_class.force_type(rv, request.environ)
File "C:\Users\biltu\Anaconda3\lib\site-packages\werkzeug\wrappers.py", line 921, in force_type
response = BaseResponse(*_run_wsgi_app(response, environ))
File "C:\Users\biltu\Anaconda3\lib\site-packages\werkzeug\test.py", line 923, in run_wsgi_app
app_rv = app(environ, start_response)
TypeError: 'dict' object is not callable