My task is to extract text from PDF files. I used the PyPDF2 library, and this is the sample data in the PDF file:
Name: Samplename
Origin: Nil
Address: Sample Address
However, after extraction, this is the result:
Samplename:Name
OriginNil:
Sample AddressNil:
May I know how to rearrange the extracted data into the order it had in the PDF? Thank you in advance.
This is the code that I found somewhere online
from PyPDF2 import PdfFileReader, PdfFileWriter

file_path = 'test.pdf'
pdf = PdfFileReader(file_path)

# Dump the extracted text of every page into one output file, with a
# 100-character dashed rule separating the pages.
with open('test.csv', 'w') as f:
    for page_num in range(pdf.numPages):
        # print('Page: {0}'.format(page_num))
        pageObj = pdf.getPage(page_num)
        try:
            txt = pageObj.extractText()
            print(''.center(100, '-'))
        except Exception:
            # Narrowed from a bare `except:` (which would also trap
            # KeyboardInterrupt/SystemExit); extraction stays best-effort
            # per page, so a failing page is simply skipped.
            pass
        else:
            f.write('Page {0}\n'.format(page_num + 1))
            f.write(''.center(100, '-'))
            f.write(txt)
# No explicit f.close() needed: the with-block closes the file. The
# original called f.close() inside the with-body, which was redundant
# and closed the file after the first page.
Related
I'm using PyPDF4 to read text from a PDF I downloaded. This works, but the text string is not readable:
ÓŒŁ–Ł#`#䎖Ł#`#Ä›¥–Ž¢–#¥ŒŒŽ—–fi–Ł
Áfi⁄–fl–Ł–#›ŁƒŒŽfl†£›–
As far as I know the file is not encrypted, I can open it in Acrobat Reader without problem. In reader I can also select / copy / paste the text correctly.
for reference: this is the code:
import glob
import PyPDF4

relevant_path = 'C:\\_Personal\\Mega\\PycharmProjects\\PDFHandler\\docs\\input\\'

if __name__ == '__main__':
    # Walk every PDF in the input folder and print its full extracted text.
    for PDFFile in glob.iglob(relevant_path + '*.pdf', recursive=True):
        print('Processing File: ' + PDFFile.split('\\')[-1])
        pdfReader = PyPDF4.PdfFileReader(PDFFile)
        num_pages = pdfReader.numPages
        print(num_pages)
        # Concatenate the text of all pages, in page order, into one string.
        text = ''.join(
            pdfReader.getPage(idx).extractText() for idx in range(num_pages)
        )
        print(text)
any hints? other packages I could use? ...
I am trying to extract text from PDF, but when I extract the text some words and numbers are missing.
Is it possible to extract the text without missing words and numbers?
from pdf2image import convert_from_path
import pytesseract
from PyPDF2 import PdfReader

path = "file.pdf"

reader = PdfReader(path)
pages = len(reader.pages)

# OCR the document page by page: render each page to an image, run
# Tesseract on it, and append the recognized text to "<name>.txt".
# The context manager guarantees the output file is closed even if a
# page fails mid-run (the original leaked the handle and never closed it).
with open(path.rsplit(".", 1)[0] + ".txt", "a") as output:
    for i in range(1, pages + 1):
        # Render only the current page to keep memory usage flat on
        # large documents.
        image = convert_from_path(path, first_page=i, last_page=i)
        print(f"Converting page: {i}/{pages}")
        say = pytesseract.image_to_string(image[0])
        output.write(say + "\n")
        # Flush so partial progress is on disk while the run continues.
        output.flush()

print(f"Conversion of {path} Complete")
I am using the code below to get any free journal PDFs from PubMed. It does download something, but when I look at it, it just consists of the number 1. Any ideas on where I am going wrong? Thank you.
import metapub
from urllib.request import urlretrieve
import textract
from pathlib import Path

another_path='/content/Articles/'
pmid_list=['35566889','33538053', '30848212']

# Fetch each PubMed article's free-full-text PDF and convert it to text.
for i in range(len(pmid_list)):
    query=pmid_list[i]
    #for ind in pmid_df.index:
    # query= pmid_df['PMID'][ind]
    url = metapub.FindIt(query).url
    try:
        # BUG(review): urlretrieve(url) with no second argument saves the
        # download to a temporary file with a random name, so nothing is
        # ever written at `out_file` below.
        urlretrieve(url)
        file_name = query
        out_file = another_path + file_name
        # BUG(review): opening out_file with "w" truncates it *before*
        # textract.process() reads it, so an empty/non-existent file is
        # processed. textract.process() also returns bytes, which cannot
        # be written to a file opened in text mode.
        with open(out_file, "w") as textfile:
            textfile.write(textract.process(out_file,extension='pdf',method='pdftotext',encoding="utf_8",
            ))
    except:
        # Swallows every error, which is what hides the failures above.
        continue
I see two mistakes.
First: urlretrieve(url) saves data in temporary file with random filename - so you can't access it because you don't know its filename. You should use second parameter to save it with own filename.
urlretrieve(url, file_name)
Second: you use the same out_file to process file (process(out_file)) and write result (open(out_file, 'w')) - but first you use open() which deletes all content in file and later it will process empty file. You should first process file and later open it for writing.
data = textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8")
with open(out_file, "wb") as textfile: # save bytes
textfile.write(data)
Or you should write the result under a different name (i.e. with the extension .txt).
Full working example with other small changes
import os
from urllib.request import urlretrieve
import metapub
import textract

#another_path = '/content/Articles/'
another_path = './'
pmid_list = ['35566889','33538053', '30848212']

# For each PubMed ID: resolve the free-full-text URL, download the PDF,
# extract its text with textract, and save it next to the PDF as a .txt file.
for pmid in pmid_list:
    print('query:', pmid)
    url = metapub.FindIt(pmid).url
    print('url:', url)
    if not url:
        # FindIt found no free full-text source for this ID; skip it.
        continue
    try:
        out_file = os.path.join(another_path, pmid)
        print('out_file:', out_file)
        print('... downloading')
        urlretrieve(url, out_file + '.pdf')
        print('... processing')
        data = textract.process(out_file + '.pdf', extension='pdf', method='pdftotext', encoding="utf_8")
        print('... saving')
        # textract returns bytes, hence the binary write mode.
        with open(out_file + '.txt', "wb") as textfile:
            textfile.write(data)
        print('... OK')
    except Exception as ex:
        print('Exception:', ex)
I would like to iterate through PDF links saved in python dataframe. The goal is to open the PDF links, save the PDFs and extract text from them, then save the text from each corresponding link in a new column.
Dataframe looks like this:
URL
0 https://westafricatradehub.com/wp-content/uploads/2021/07/RFA-WATIH-1295_Senegal-RMNCAH-Activity_English-Version.pdf
1 https://westafricatradehub.com/wp-content/uploads/2021/07/RFA-WATIH-1295_Activit%C3%A9-RMNCAH-S%C3%A9n%C3%A9gal_Version-Fran%C3%A7aise.pdf
2 https://westafricatradehub.com/wp-content/uploads/2021/07/Attachment-2_Full-Application-Template_Senegal-RMNCAH-Activity_English-Version.docx
3 https://westafricatradehub.com/wp-content/uploads/2021/07/Pi%C3%A8ce-Jointe-2_Mod%C3%A8le-de-Demande-Complet_Activit%C3%A9-RMNCAH-S%C3%A9n%C3%A9gal_Version-Fran%C3%A7aise.docx
4 https://westafricatradehub.com/wp-content/uploads/2021/07/Attachment-3_Trade-Hub-Performance-Indicators-Table.xlsx
5 https://westafricatradehub.com/wp-content/uploads/2021/07/Attachment-10_Project-Budget-Template-RMNCAH.xlsx
6 https://westafricatradehub.com/wp-content/uploads/2021/08/Senegal-Health-RFA-Webinar-QA.pdf
7 https://westafricatradehub.com/wp-content/uploads/2021/02/APS-WATIH-1021_Catalytic-Business-Concepts-Round-2.pdf
8 https://westafricatradehub.com/wp-content/uploads/2021/02/APS-WATIH-1021_Concepts-d%E2%80%99Affaires-Catalytiques-2ieme-Tour.pdf
9 https://westafricatradehub.com/wp-content/uploads/2021/06/APS-WATIH-1247_Research-Development-Round-2.pdf
I was able to do that for one link but not for the whole dataframe
import urllib.request

pdf_link = "https://westafricatradehub.com/wp-content/uploads/2021/07/RFA-WATIH-1295_Senegal-RMNCAH-Activity_English-Version.pdf"

def download_file(download_url, filename):
    """Download `download_url` and save it as `<filename>.pdf` in the CWD."""
    response = urllib.request.urlopen(download_url)
    # Context manager closes the file even if write() raises; the original
    # used a manual open/close pair that leaked the handle on error.
    with open(filename + ".pdf", 'wb') as file:
        file.write(response.read())

download_file(pdf_link, "Test")

#Code to extract text from PDF
import textract
text = textract.process("/Users/fze/Dropbox (LCG Team)/LCG Folder (1)/BD Scan Automation/Python codes/Test.PDF")
print(text)
Thank you!
Here you go:
import urllib.request
import textract

def download_file(download_url, filename):
    """Download `download_url` and save it as `<filename>.pdf` in the CWD."""
    response = urllib.request.urlopen(download_url)
    # Context manager guarantees the file handle is closed even on error.
    with open(filename + ".pdf", 'wb') as file:
        file.write(response.read())

# `df` is the dataframe from the question: one document URL per row,
# in the first column.
df['Text'] = ''
for i in range(df.shape[0]):
    pdf_link = df.iloc[i, 0]
    download_file(pdf_link, f"pdf_{i}")
    text = textract.process(f"/Users/fze/Dropbox (LCG Team)/LCG Folder (1)/BD Scan Automation/Python codes/pdf_{i}.PDF")
    # .loc avoids pandas chained assignment (df['Text'][i] = ...), which
    # raises SettingWithCopyWarning and may write to a copy instead of
    # the original dataframe.
    df.loc[i, 'Text'] = text
Steps:
Read multiple .html files in the directory
extract the titles of the html
Need:
- sending the titles into individual .txt files
Expected: Any advice. Ideally I want to extract the integers from each HTML file's name (e.g. '23434.html') and name the corresponding text file accordingly ('23434.txt').
Results:
- there is no txt file created in the designated path.
- Nothing gets written
# For each .html file in dir_path, write its <title> text to a .txt file.
# NOTE(review): every iteration opens the same file "m.txt" in "w" mode, so
# each title overwrites the previous one -- and it is created in the current
# working directory, not in dir_path, which is why "no txt file is created
# in the designated path".
for file_name in glob.glob(os.path.join(dir_path, "*.html")):
    with open(file_name) as html_file:
        soup=BeautifulSoup(html_file)
        d=soup.title.get_text()
        #resultfile=re.findall('\d+', file_name)
        with open("m"+".txt", "w") as outfile:
            outfile.write(d)
            # BUG(review): missing parentheses -- this references the method
            # without calling it (a no-op); harmless only because the
            # with-block closes the file anyway.
            outfile.close
# For every .html file in dir_path, write its <title> text to a file named
# after the HTML file's basename (e.g. "23434.html" -> "23434.txt").
for html_path in glob.glob(os.path.join(dir_path, "*.html")):
    with open(html_path) as src:
        title_text = BeautifulSoup(src).title.get_text()
        stem = os.path.basename(html_path).rsplit('.', 1)[0]
    with open(stem + '.txt', 'w') as dst:
        dst.write(title_text)