My task is to extract text from PDF files. I used the PyPDF2 library, and this is the sample data in the PDF file:
Name: Samplename
Origin: Nil
Address: Sample Address
However, after extraction, this is the result:
Samplename:Name
OriginNil:
Sample AddressNil:
May I know how to rearrange the extracted data into the order it had in the PDF? Thank you in advance.
This is the code that I found somewhere online
from PyPDF2 import PdfFileReader, PdfFileWriter

file_path = 'test.pdf'
pdf = PdfFileReader(file_path)

# Dump the extracted text of every page into one output file, with a
# 100-character dashed rule separating the pages.
with open('test.csv', 'w') as f:
    for page_num in range(pdf.numPages):
        # print('Page: {0}'.format(page_num))
        pageObj = pdf.getPage(page_num)
        try:
            txt = pageObj.extractText()
            print(''.center(100, '-'))
        except Exception:
            # Narrowed from a bare `except:` (which would also trap
            # KeyboardInterrupt/SystemExit); extraction stays best-effort
            # per page, so a failing page is simply skipped.
            pass
        else:
            f.write('Page {0}\n'.format(page_num + 1))
            f.write(''.center(100, '-'))
            f.write(txt)
# No explicit f.close() needed: the with-block closes the file. The
# original called f.close() inside the with-body, which was redundant
# and closed the file after the first page.
Related
I'm using PyPDF4 to read text from a PDF I downloaded. This works, but the text string is not readable:
ÓŒŁ–Ł#`#䎖Ł#`#Ä›¥–Ž¢–#¥ŒŒŽ—–fi–Ł
Áfi⁄–fl–Ł–#›ŁƒŒŽfl†£›–
As far as I know the file is not encrypted, I can open it in Acrobat Reader without problem. In reader I can also select / copy / paste the text correctly.
for reference: this is the code:
import glob
import PyPDF4

relevant_path = 'C:\\_Personal\\Mega\\PycharmProjects\\PDFHandler\\docs\\input\\'

if __name__ == '__main__':
    # Walk every PDF in the input folder and print its full extracted text.
    for PDFFile in glob.iglob(relevant_path + '*.pdf', recursive=True):
        print('Processing File: ' + PDFFile.split('\\')[-1])
        pdfReader = PyPDF4.PdfFileReader(PDFFile)
        num_pages = pdfReader.numPages
        print(num_pages)
        # Concatenate the text of all pages, in page order, into one string.
        text = ''.join(
            pdfReader.getPage(idx).extractText() for idx in range(num_pages)
        )
        print(text)
any hints? other packages I could use? ...
I am trying to extract text from PDF, but when I extract the text some words and numbers are missing.
Is it possible to extract the text without missing words and numbers?
from pdf2image import convert_from_path
import pytesseract
from PyPDF2 import PdfReader

path = "file.pdf"

reader = PdfReader(path)
pages = len(reader.pages)

# OCR the document page by page: render each page to an image, run
# Tesseract on it, and append the recognized text to "<name>.txt".
# The context manager guarantees the output file is closed even if a
# page fails mid-run (the original leaked the handle and never closed it).
with open(path.rsplit(".", 1)[0] + ".txt", "a") as output:
    for i in range(1, pages + 1):
        # Render only the current page to keep memory usage flat on
        # large documents.
        image = convert_from_path(path, first_page=i, last_page=i)
        print(f"Converting page: {i}/{pages}")
        say = pytesseract.image_to_string(image[0])
        output.write(say + "\n")
        # Flush so partial progress is on disk while the run continues.
        output.flush()

print(f"Conversion of {path} Complete")
I am using the code below to get any free journal PDFs from PubMed. It does download something, but when I look at it, it just consists of the number 1. Any ideas on where I am going wrong? Thank you.
import metapub
from urllib.request import urlretrieve
import textract
from pathlib import Path

another_path='/content/Articles/'
pmid_list=['35566889','33538053', '30848212']

# Fetch each PubMed article's free-full-text PDF and convert it to text.
for i in range(len(pmid_list)):
    query=pmid_list[i]
    #for ind in pmid_df.index:
    # query= pmid_df['PMID'][ind]
    url = metapub.FindIt(query).url
    try:
        # BUG(review): urlretrieve(url) with no second argument saves the
        # download to a temporary file with a random name, so nothing is
        # ever written at `out_file` below.
        urlretrieve(url)
        file_name = query
        out_file = another_path + file_name
        # BUG(review): opening out_file with "w" truncates it *before*
        # textract.process() reads it, so an empty/non-existent file is
        # processed. textract.process() also returns bytes, which cannot
        # be written to a file opened in text mode.
        with open(out_file, "w") as textfile:
            textfile.write(textract.process(out_file,extension='pdf',method='pdftotext',encoding="utf_8",
            ))
    except:
        # Swallows every error, which is what hides the failures above.
        continue
I see two mistakes.
First: urlretrieve(url) saves data in temporary file with random filename - so you can't access it because you don't know its filename. You should use second parameter to save it with own filename.
urlretrieve(url, file_name)
Second: you use the same out_file to process file (process(out_file)) and write result (open(out_file, 'w')) - but first you use open() which deletes all content in file and later it will process empty file. You should first process file and later open it for writing.
data = textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8")
with open(out_file, "wb") as textfile: # save bytes
textfile.write(data)
Or you should write the result under a different name (i.e. with the extension .txt).
Full working example with other small changes
import os
from urllib.request import urlretrieve
import metapub
import textract

#another_path = '/content/Articles/'
another_path = './'
pmid_list = ['35566889','33538053', '30848212']

# For each PubMed ID: resolve the free-full-text URL, download the PDF,
# extract its text with textract, and save it next to the PDF as a .txt file.
for pmid in pmid_list:
    print('query:', pmid)
    url = metapub.FindIt(pmid).url
    print('url:', url)
    if not url:
        # FindIt found no free full-text source for this ID; skip it.
        continue
    try:
        out_file = os.path.join(another_path, pmid)
        print('out_file:', out_file)
        print('... downloading')
        urlretrieve(url, out_file + '.pdf')
        print('... processing')
        data = textract.process(out_file + '.pdf', extension='pdf', method='pdftotext', encoding="utf_8")
        print('... saving')
        # textract returns bytes, hence the binary write mode.
        with open(out_file + '.txt', "wb") as textfile:
            textfile.write(data)
        print('... OK')
    except Exception as ex:
        print('Exception:', ex)
I would like to iterate through PDF links saved in python dataframe. The goal is to open the PDF links, save the PDFs and extract text from them, then save the text from each corresponding link in a new column.
Dataframe looks like this:
URL
0 https://westafricatradehub.com/wp-content/uploads/2021/07/RFA-WATIH-1295_Senegal-RMNCAH-Activity_English-Version.pdf
1 https://westafricatradehub.com/wp-content/uploads/2021/07/RFA-WATIH-1295_Activit%C3%A9-RMNCAH-S%C3%A9n%C3%A9gal_Version-Fran%C3%A7aise.pdf
2 https://westafricatradehub.com/wp-content/uploads/2021/07/Attachment-2_Full-Application-Template_Senegal-RMNCAH-Activity_English-Version.docx
3 https://westafricatradehub.com/wp-content/uploads/2021/07/Pi%C3%A8ce-Jointe-2_Mod%C3%A8le-de-Demande-Complet_Activit%C3%A9-RMNCAH-S%C3%A9n%C3%A9gal_Version-Fran%C3%A7aise.docx
4 https://westafricatradehub.com/wp-content/uploads/2021/07/Attachment-3_Trade-Hub-Performance-Indicators-Table.xlsx
5 https://westafricatradehub.com/wp-content/uploads/2021/07/Attachment-10_Project-Budget-Template-RMNCAH.xlsx
6 https://westafricatradehub.com/wp-content/uploads/2021/08/Senegal-Health-RFA-Webinar-QA.pdf
7 https://westafricatradehub.com/wp-content/uploads/2021/02/APS-WATIH-1021_Catalytic-Business-Concepts-Round-2.pdf
8 https://westafricatradehub.com/wp-content/uploads/2021/02/APS-WATIH-1021_Concepts-d%E2%80%99Affaires-Catalytiques-2ieme-Tour.pdf
9 https://westafricatradehub.com/wp-content/uploads/2021/06/APS-WATIH-1247_Research-Development-Round-2.pdf
I was able to do that for one link but not for the whole dataframe
import urllib.request

pdf_link = "https://westafricatradehub.com/wp-content/uploads/2021/07/RFA-WATIH-1295_Senegal-RMNCAH-Activity_English-Version.pdf"

def download_file(download_url, filename):
    """Download `download_url` and save it as `<filename>.pdf` in the CWD."""
    response = urllib.request.urlopen(download_url)
    # Context manager closes the file even if write() raises; the original
    # used a manual open/close pair that leaked the handle on error.
    with open(filename + ".pdf", 'wb') as file:
        file.write(response.read())

download_file(pdf_link, "Test")

#Code to extract text from PDF
import textract
text = textract.process("/Users/fze/Dropbox (LCG Team)/LCG Folder (1)/BD Scan Automation/Python codes/Test.PDF")
print(text)
Thank you!
Here you go:
import urllib.request
import textract

def download_file(download_url, filename):
    """Download `download_url` and save it as `<filename>.pdf` in the CWD."""
    response = urllib.request.urlopen(download_url)
    # Context manager guarantees the file handle is closed even on error.
    with open(filename + ".pdf", 'wb') as file:
        file.write(response.read())

# `df` is the dataframe from the question: one document URL per row,
# in the first column.
df['Text'] = ''
for i in range(df.shape[0]):
    pdf_link = df.iloc[i, 0]
    download_file(pdf_link, f"pdf_{i}")
    text = textract.process(f"/Users/fze/Dropbox (LCG Team)/LCG Folder (1)/BD Scan Automation/Python codes/pdf_{i}.PDF")
    # .loc avoids pandas chained assignment (df['Text'][i] = ...), which
    # raises SettingWithCopyWarning and may write to a copy instead of
    # the original dataframe.
    df.loc[i, 'Text'] = text
Steps:
Read multiple .html files in the directory
extract the titles of the html
Need:
- sending the titles into individual .txt files
Expected: Any advice. Ideally I want to extract the integers from each HTML file's name (e.g. '23434.html') and name the corresponding text file accordingly ('23434.txt').
Results:
- there is no txt file created in the designated path.
- Nothing gets written
# For each .html file in dir_path, write its <title> text to a .txt file.
# NOTE(review): every iteration opens the same file "m.txt" in "w" mode, so
# each title overwrites the previous one -- and it is created in the current
# working directory, not in dir_path, which is why "no txt file is created
# in the designated path".
for file_name in glob.glob(os.path.join(dir_path, "*.html")):
    with open(file_name) as html_file:
        soup=BeautifulSoup(html_file)
        d=soup.title.get_text()
        #resultfile=re.findall('\d+', file_name)
        with open("m"+".txt", "w") as outfile:
            outfile.write(d)
            # BUG(review): missing parentheses -- this references the method
            # without calling it (a no-op); harmless only because the
            # with-block closes the file anyway.
            outfile.close
# For every .html file in dir_path, write its <title> text to a file named
# after the HTML file's basename (e.g. "23434.html" -> "23434.txt").
for html_path in glob.glob(os.path.join(dir_path, "*.html")):
    with open(html_path) as src:
        title_text = BeautifulSoup(src).title.get_text()
        stem = os.path.basename(html_path).rsplit('.', 1)[0]
    with open(stem + '.txt', 'w') as dst:
        dst.write(title_text)