This code downloads metadata from a repository, writes that metadata to a file, downloads a PDF, converts the PDF to text, and then deletes the original PDF:
for record in records:
    record_data = []  # data is stored in record_data
    for name, metadata in record.metadata.items():
        for i, value in enumerate(metadata):
            if value:
                record_data.append(value)

    fulltext = ''
    file_path = ''
    file_path_metadata = ''
    unique_id = str(uuid.uuid4())
    for data in record_data:
        if 'Fulltext' in data:
            # the link to the pdf
            fulltext = data.replace('Fulltext ', '')
            # path where the pdf will be stored
            file_path = '/' + os.path.basename(data).replace('.pdf', '') + unique_id + '.pdf'
            # path where the metadata will be stored
            file_path_metadata = '/' + os.path.basename(data).replace('.pdf', '') + unique_id + '_metadata.txt'
            print fulltext, file_path

    # Write metadata to file
    if fulltext:
        try:
            write_metadata = open(path_to_institute + file_path_metadata, 'w')
            for i, data in enumerate(record_data):
                write_metadata.write('MD_' + str(i) + ': ' + data.encode('utf8') + '\n')
            write_metadata.close()
        except Exception as e:
            # Exceptions due to missing path to file
            print 'Exception when writing metadata: {}'.format(e)
            print fulltext, path_to_institute, file_path_metadata

        # Download pdf
        download_pdf(fulltext, path_to_institute + file_path)
        # Create text file and delete pdf
        pdf2text(path_to_institute + file_path)
From some measurements, the download_pdf and pdf2text methods take quite a long time.
Here are those methods:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
import os
import urllib2

def remove_file(path):
    try:
        os.remove(path)
    except OSError, e:
        print ("Error: %s - %s." % (e.filename, e.strerror))
def pdf2text(path):
    string_handling = StringIO()
    parser = PDFParser(open(path, 'rb'))
    save_file = open(path.replace('.pdf', '.txt'), 'w')
    try:
        document = PDFDocument(parser)
    except Exception as e:
        print '{} is not a readable document. Exception {}'.format(path, e)
        return
    if document.is_extractable:
        recourse_manager = PDFResourceManager()
        device = TextConverter(recourse_manager,
                               string_handling,
                               codec='ascii',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(recourse_manager, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
        # write to file
        save_file.write(string_handling.getvalue())
        save_file.close()
        # deletes pdf
        remove_file(path)
    else:
        print(path, "Warning: could not extract text from pdf file.")
        return
def download_pdf(url, path):
    try:
        f = urllib2.urlopen(url)
    except Exception as e:
        print e
        f = None
    if f:
        data = f.read()
        with open(path, "wb") as code:
            code.write(data)
So I'm thinking I should run those in parallel.
I tried this, but it did not work:
pool = mp.Pool(processes=len(process_data))
for i in process_data:
    print i
    pool.apply(download_pdf, args=(i[0], i[1]))

pool = mp.Pool(processes=len(process_data))
for i in process_data:
    print i[1]
    pool.apply(pdf2text, args=(i[1],))
It takes just as long. The printing happens as if the processes run one at a time...
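Note that Pool.apply blocks until each call returns, which is why the loops above still run one job at a time. A minimal sketch of the same two loops with apply_async instead (assuming, as above, that process_data holds (url, path) pairs):

import multiprocessing as mp

# apply_async submits the work without waiting for the result,
# so all workers run at the same time.
pool = mp.Pool(processes=4)
async_results = [pool.apply_async(download_pdf, args=(i[0], i[1])) for i in process_data]
pool.close()
pool.join()  # wait for all downloads to finish

pool = mp.Pool(processes=4)
async_results = [pool.apply_async(pdf2text, args=(i[1],)) for i in process_data]
pool.close()
pool.join()  # wait for all conversions to finish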
I finally found out a way to run the code in parallel. Unbelievable how much faster it got.
import multiprocessing as mp

jobs = []
for i in process_data:
    p = mp.Process(target=download_pdf, args=(i[0], i[1]))
    jobs.append(p)
    p.start()

for i, data in enumerate(process_data):
    print data
    p = mp.Process(target=pdf2text, args=(data[1],))
    jobs[i].join()
    p.start()
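One thing to watch: the second loop above never joins the pdf2text processes, so the script may exit before every conversion finishes. A hedged sketch of the same pipeline using a worker pool instead (download_and_convert is a new helper; download_pdf and pdf2text are the functions shown earlier):

import multiprocessing as mp

def download_and_convert(args):
    # One worker handles the whole pipeline for a single record.
    url, path = args
    download_pdf(url, path)
    pdf2text(path)

if __name__ == '__main__':
    pool = mp.Pool(processes=mp.cpu_count())
    # map blocks until every (download + convert) job is done
    pool.map(download_and_convert, [(i[0], i[1]) for i in process_data])
    pool.close()
    pool.join()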
Here is a great article on how to build stuff in parallel; it uses multiprocessing.dummy to run things in different threads.
Here is a little example:
from urllib2 import urlopen
from multiprocessing.dummy import Pool

urls = [url_a,
        url_b,
        url_c]

pool = Pool()
res = pool.map(urlopen, urls)
pool.close()
pool.join()
For Python >= 3.3 I suggest concurrent.futures.
Example:
import concurrent.futures
import urllib.request

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

def load_url(url, timeout):
    return urllib.request.urlopen(url, timeout=timeout).read()

with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    future_list = [executor.submit(load_url, url, 30) for url in URLS]
example taken from: here
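To actually collect the results, a common follow-up (a small sketch using the standard concurrent.futures API) is as_completed:

# Iterate over the futures as each download finishes
for future in concurrent.futures.as_completed(future_list):
    try:
        data = future.result()
        print(len(data), 'bytes')
    except Exception as exc:
        print('request generated an exception:', exc)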
Related
I am using the code below to get any free journal PDFs from PubMed. It does download something, but when I look at it, it just consists of the number 1. Any ideas on where I am going wrong? Thank you.
import metapub
from urllib.request import urlretrieve
import textract
from pathlib import Path

another_path = '/content/Articles/'
pmid_list = ['35566889', '33538053', '30848212']

for i in range(len(pmid_list)):
    query = pmid_list[i]
    #for ind in pmid_df.index:
    #    query = pmid_df['PMID'][ind]
    url = metapub.FindIt(query).url
    try:
        urlretrieve(url)
        file_name = query
        out_file = another_path + file_name
        with open(out_file, "w") as textfile:
            textfile.write(textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8",
                                            ))
    except:
        continue
I see two mistakes.
First: urlretrieve(url) saves the data in a temporary file with a random filename, so you can't access it because you don't know that filename. You should use the second parameter to save it under your own filename.
urlretrieve(url, file_name)
Second: you use the same out_file both to process the file (process(out_file)) and to write the result (open(out_file, 'w')), but open() runs first and truncates the file, so you end up processing an empty file. You should process the file first and open it for writing afterwards.
data = textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8")

with open(out_file, "wb") as textfile:  # save bytes
    textfile.write(data)
Or you could write the result under a different name (i.e. with the extension .txt).
Full working example with other small changes
import os
from urllib.request import urlretrieve

import metapub
import textract

#another_path = '/content/Articles/'
another_path = './'

pmid_list = ['35566889', '33538053', '30848212']

for query in pmid_list:
    print('query:', query)

    url = metapub.FindIt(query).url
    print('url:', url)

    if url:
        try:
            out_file = os.path.join(another_path, query)
            print('out_file:', out_file)

            print('... downloading')
            urlretrieve(url, out_file + '.pdf')

            print('... processing')
            data = textract.process(out_file + '.pdf', extension='pdf', method='pdftotext', encoding="utf_8")

            print('... saving')
            with open(out_file + '.txt', "wb") as textfile:  # save bytes
                textfile.write(data)

            print('... OK')
        except Exception as ex:
            print('Exception:', ex)
I need to extract specific text from invoice PDF files with different structures using Python and store the output in particular Excel columns. All the PDF files have different structures but the same content values.
I tried to solve it but was not able to extract only the specific text values.
Sample PDF file :
Click to view the sample file
I need to extract Invoice ID, Issue Date, Subject, and Amount Due from the whole PDF file.
Script I have used so far:
import PyPDF2
import re
pdfFileObj = open('test.pdf','rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pageObj = pdfReader.getPage(0)
text = str(pageObj.extractText())
quotes = re.findall(r'"[^"]*"',text)
print(quotes)
You have a very nice pdf document, because your pdf has form fields, so you can use them directly to read the data:
import PyPDF2
pdfFileObj = open('test.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
fields = pdfReader.getFormTextFields()
print(fields["Invoice ID"])
print(fields["Issue Date"])
print(fields["Subject"])
print(fields["Amount Due"])
EDIT:
I combined your requested data (from here: How to extract only specific text from PDF file using python) into a little script with 3 ways of parsing the PDF (one per PDF). The problem is that your PDFs differ a lot and each package has advantages on different PDFs, so I think you have to combine them: try each function in turn until one returns a result. I hope this is a good start for you. You may have to change the regexes if you get more, different PDFs; you could also store all regexes (per field) in lists and share them between the parsing functions, so you end up with 3 parsing functions and 4 lists of regexes used by 2 of them.
import os
import re
from io import StringIO

import PyPDF2
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


def parse_pdf_by_regex_2(filename: str) -> dict:
    output_string = StringIO()
    with open(filename, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    regex_invoice_no = re.compile(r"Invoice No.:\s*(\w+)\s")
    regex_order_no = re.compile(r"IRN:\s*(\d+)")
    regex_due_date = re.compile(r"Due Date: (\d{2}\.\d{2}\.\d{4})")
    regex_total_due = re.compile(r"([\d,.]+) \n\nTotal Invoice Value\(in words\)")

    try:
        return {"invoice_id": re.search(regex_invoice_no, output_string.getvalue()).group(1),
                "issue_date": re.search(regex_due_date, output_string.getvalue()).group(1),
                "subject": re.search(regex_order_no, output_string.getvalue()).group(1),
                "amount": re.search(regex_total_due, output_string.getvalue()).group(1)}
    except AttributeError as err:
        print("Not all elements have been found")
        return {}


def parse_pdf_by_form_fields(filename: str) -> dict:
    with open(filename, 'rb') as file:
        pdf_reader = PyPDF2.PdfFileReader(file)
        try:
            fields = pdf_reader.getFormTextFields()
        except TypeError as err:
            # print("No FormFields available")
            return {}
    try:
        # You can also check whether only some values are missing; whether that can happen depends on your data
        return {"invoice_id": fields["Invoice ID"],
                "issue_date": fields["Issue Date"],
                "subject": fields["Subject"],
                "amount": fields["Amount Due"]}
    except KeyError as err:
        # print(f"Key not found: '{err.args[0]}'")
        return {}


def parse_pdf_by_regex(filename: str) -> dict:
    with open(filename, 'rb') as file:
        pdf_reader = PyPDF2.PdfFileReader(file)
        text_data = ""
        for page_no in range(pdf_reader.getNumPages()):
            text_data += pdf_reader.getPage(page_no).extractText()

    regex_invoice_no = re.compile(r"Invoice Number\s*(INV-\d+)")
    regex_order_no = re.compile(r"Order Number(\d+)")
    regex_due_date = re.compile(r"Due Date(\S+ \d{1,2}, \d{4})")
    regex_total_due = re.compile(r"Total Due(\$\d+\.\d{1,2})")

    try:
        return {"invoice_id": re.search(regex_invoice_no, text_data).group(1),
                "issue_date": re.search(regex_due_date, text_data).group(1),
                "subject": re.search(regex_order_no, text_data).group(1),
                "amount": re.search(regex_total_due, text_data).group(1)}
    except AttributeError as err:
        # print("Not all elements have been found")
        return {}


def parse_pdf(filename: str) -> dict:
    # Hint: ':=' is available since Python 3.8
    if data := parse_pdf_by_form_fields(filename=filename):
        return data
    elif data := parse_pdf_by_regex(filename=filename):
        return data
    elif data := parse_pdf_by_regex_2(filename=filename):
        return data
    else:
        print("No data found")
        return {}


if __name__ == '__main__':
    for fname in os.listdir("."):
        if fname.startswith("testfile"):
            print(f"check {fname}")
            print(parse_pdf(filename=fname))
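Since the question also asks to store the values in particular Excel columns, one possible follow-up (a sketch assuming pandas and an Excel writer such as openpyxl are installed) is to collect the dicts returned by parse_pdf and write them with pandas:

import pandas as pd

rows = []
for fname in os.listdir("."):
    if fname.startswith("testfile"):
        data = parse_pdf(filename=fname)
        if data:
            data["file"] = fname
            rows.append(data)

# One column per key: invoice_id, issue_date, subject, amount, file
pd.DataFrame(rows).to_excel("invoices.xlsx", index=False)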
I am trying to read and search for data in PDF files with tika. I have several LibreOffice and PDF files with the same names but different extensions.
First with this straightforward code:
from tika import parser
import os
from timeit import default_timer as timer

files_to_search = []
times = []

dir_list = os.listdir(r'\\LS-WVLEF8\backup\laskut\secun')
for file_name in dir_list:
    if file_name.find('nterme') > 0 and file_name.find('pdf') > 0:
        files_to_search.append(file_name)

for a in range(20):
    tic = timer()
    path_and_name = ""
    for item in files_to_search:
        path_and_name = r'\\LS-WVLEF8\backup\laskut\secun' + '\\' + item
        try:
            file_data = parser.from_file(path_and_name)
            text = file_data['content']
            text = text.strip()
            if text.find('835528') > 1:
                print('found ' + item)
        except Exception as e:
            print('Exception')
            print(e)
            while 1:
                pass
    tac = timer()
    times.append(tac - tic)
    print('single time ', tac - tic)
    with open('single.txt', 'a') as the_file:
        the_file.write(str(tac - tic) + '\n')

average = sum(times) / 20
max = times.index(max(times))
with open('single.txt', 'a') as the_file:
    the_file.write('average = ' + str(average) + '\n')
    the_file.write('max = ' + str(max) + '\n')
It works slowly. I get average = 1.732.
Then with this, using multiprocessing:
from tika import tika, parser
from multiprocessing import Pool
import os
from timeit import default_timer as timer

def tika_parser(files_to_search):
    try:
        data = parser.from_file(r'\\LS-WVLEF8\backup\laskut\secun\\' + files_to_search)
        text = data['content']
        text = text.strip()
        if text.find('835528') > 1:
            print('found ' + files_to_search)
    except Exception as e:
        print('Exception')
        print(e)
        while 1:
            pass

if __name__ == '__main__':
    files_to_search = []
    times = []

    dir_list = os.listdir(r'\\LS-WVLEF8\backup\laskut\secun')
    for file_name in dir_list:
        if file_name.find('nterme') > 0 and file_name.find('pdf') > 0:
            files_to_search.append(file_name)

    for a in range(20):
        tic = timer()
        pool = Pool()
        pool.map(tika_parser, files_to_search)
        pool.close()
        tac = timer()
        times.append(tac - tic)
        print('multi time ', tac - tic)
        with open('multi.txt', 'a') as the_file:
            the_file.write(str(tac - tic) + '\n')

    average = sum(times) / 20
    max = times.index(max(times))
    with open('multi.txt', 'a') as the_file:
        the_file.write('average = ' + str(average) + '\n')
        the_file.write('max = ' + str(max) + '\n')
This is a bit faster. I get average = 1.320
Is there a way to do this faster with tika? Or should I look for PyPDF2 or something else?
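One thing that may be worth trying (a sketch, not a measured answer): tika-python sends each file to a local Tika server over HTTP, so the work is largely I/O-bound and a thread pool avoids the per-process start-up overhead. The directory and search string below are taken from the question:

from concurrent.futures import ThreadPoolExecutor
from tika import parser
import os

base_dir = r'\\LS-WVLEF8\backup\laskut\secun'

def contains_needle(file_name):
    # Parse one file via the Tika server and look for the search string.
    data = parser.from_file(os.path.join(base_dir, file_name))
    text = (data.get('content') or '').strip()
    return file_name if '835528' in text else None

files_to_search = [f for f in os.listdir(base_dir)
                   if 'nterme' in f and 'pdf' in f]

with ThreadPoolExecutor(max_workers=8) as executor:
    for hit in executor.map(contains_needle, files_to_search):
        if hit:
            print('found', hit)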
I created a function that opens each file in a directory, extracts the text from each file, and outputs it to an Excel sheet using pandas. The indexing for each file type seems to be working just fine. However, the extracted text from all files comes out together in one list instead of being separated and placed next to its corresponding file.
See the bottom of the script for the current output and the output I want.
I believe the problem lies in the loader() function, which takes in a path, goes through each file in the directory, checks the file extension, and extracts the text.
Thank you!
import re
#import PyPDF4
import pathlib
from pathlib import Path
import shutil
from datetime import datetime
import time
from configparser import ConfigParser
import glob
import fileinput
import pandas as pd
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import docx2txt
from pptx import Presentation
import more_itertools as mit

p = Path('C:/Users/XXXX/Desktop/test')

txt_files = list(p.rglob('*txt'))
PDF_files = list(p.rglob('*pdf'))
csv_files = list(p.rglob('*csv'))
docx_files = list(p.rglob('*docx'))
pptx_files = list(p.rglob('*pptx'))
#excel_files = list(p.rglob('xls'))

def pdf_to_text(x):
    # PDFMiner
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Extract text
    fp = open(x, 'rb')
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
    fp.close()

    # Get text from StringIO
    text = sio.getvalue()

    # Cleanup
    device.close()
    sio.close()

    return text

#-------------------------------------------------------------------------------

def loader(path):
    with open(str(path.resolve()), "r", encoding="ISO-8859-1") as f:
        docx_out, pptx_out, pdf_out = [], [], []
        if path.suffix == ".pdf":
            for name1 in PDF_files:
                pdf_out.append(pdf_to_text(name1))
            return pdf_out
        elif path.suffix == ".docx":
            for name2 in docx_files:
                docx_out.append(docx2txt.process(name2))
            return docx_out
        elif path.suffix == ".pptx":
            for file in pptx_files:
                prs = Presentation(file)
                for slide in prs.slides:
                    for shape in slide.shapes:
                        if not shape.has_text_frame:
                            continue
                        for paragraph in shape.text_frame.paragraphs:
                            for run in paragraph.runs:
                                pptx_out.append(run.text)
            return pptx_out
        else:
            return f.readlines()
    print(pdf_out)

def file_generator():
    files = txt_files + PDF_files + csv_files + docx_files + pptx_files
    for item in files:
        yield {
            "path": item,
            "name": item.name[0:],
            "created": time.ctime(item.stat().st_ctime),
            "modified": time.ctime(item.stat().st_mtime),
            "content": loader(item)
        }

def to_xlsx():
    df = pd.DataFrame.from_dict(file_generator())
    df.head()
    df.to_excel("tester4.xlsx")

if __name__ == "__main__":
    to_xlsx()

#------------------------------------------------------------
OUTPUT EXAMPLE
current output:
content
["content_test1","content_test2"] test1.pdf
["content_test1","content_test2"] test2.pdf
What I want:
["content_test1"] test1.pdf
["content_test2"] test2.pdf
The appends called by each filetype_out function look like they are adding the contents of each file to the end of the list pertaining to that filetype. If you want to generate a unique list with the contents of each individual file, I'd recommend creating a separate dict for each filetype, which then includes individual lists for each file processed. Taking the PDFs as an example:
def loader(path):
    with open(str(path.resolve()), "r", encoding="ISO-8859-1") as f:
        docx_out, pptx_out, pdf_out = {}, {}, {}
        if path.suffix == ".pdf":
            for name1 in PDF_files:
                name1_contents = []
                name1_contents.append(pdf_to_text(name1))
                pdf_out[name1] = name1_contents
            return pdf_out
To then print out your results in a similar way to what you have been doing:
for name, contents in pdf_out.items():
    print(str(contents) + ' ' + str(name))
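Alternatively (a sketch that departs from the dict approach above): since file_generator() already calls loader() once per file, loader() could process only the path it receives, so each row's content column holds just that one file's text:

def loader(path):
    # Process only the file that was passed in, so file_generator()
    # gets one content value per file.
    if path.suffix == ".pdf":
        return [pdf_to_text(path)]
    elif path.suffix == ".docx":
        return [docx2txt.process(path)]
    elif path.suffix == ".pptx":
        prs = Presentation(path)
        return [run.text
                for slide in prs.slides
                for shape in slide.shapes if shape.has_text_frame
                for paragraph in shape.text_frame.paragraphs
                for run in paragraph.runs]
    else:
        with open(str(path.resolve()), "r", encoding="ISO-8859-1") as f:
            return f.readlines()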
I'm processing multiple PDF files using PyPDF2, but my script hangs somewhere. All I can see in my console are some "startxref on same line as offset" messages which, if I'm correct, are warnings, so by right it should still reach the finally block and return an empty string.
Am I doing something wrong?
import PyPDF2
import sys
import os

def decode_pdf(src_filename):
    out_str = ""
    try:
        f = open(str(src_filename), "rb")
        read_pdf = PyPDF2.PdfFileReader(f)
        number_of_pages = read_pdf.getNumPages()
        for i in range(0, number_of_pages):
            page = read_pdf.getPage(i)
            out_str = out_str + " " + page.extractText()
            out_str = ''.join(out_str.splitlines())
        f.close()
    except:
        print("Exception on pdf")
        print(sys.exc_info())
        out_str = ""
    finally:
        return out_str
I was facing this issue too and couldn't solve it using PyPDF2. I solved mine with pdfminer using the example from here
Copying the relevant code below:
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def convert(fname, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = file(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()
    return text
Call the function convert() as below:
convert('myfile.pdf', pages=[5,7])
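The snippet above is Python 2 only (cStringIO, file()). On Python 3 with pdfminer.six, a roughly equivalent sketch uses the high-level helper (page_numbers is zero-based):

from pdfminer.high_level import extract_text

# Extract text from the 6th and 8th pages (zero-based indices 5 and 7)
text = extract_text('myfile.pdf', page_numbers=[5, 7])
print(text)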