Open, save and extract text from PDFs linked in a Python dataframe

I would like to iterate through PDF links saved in a Python dataframe. The goal is to open each PDF link, save the PDF, extract its text, and then store the text from each link in a new column of the corresponding row.
The dataframe looks like this:
URL
0 https://westafricatradehub.com/wp-content/uploads/2021/07/RFA-WATIH-1295_Senegal-RMNCAH-Activity_English-Version.pdf
1 https://westafricatradehub.com/wp-content/uploads/2021/07/RFA-WATIH-1295_Activit%C3%A9-RMNCAH-S%C3%A9n%C3%A9gal_Version-Fran%C3%A7aise.pdf
2 https://westafricatradehub.com/wp-content/uploads/2021/07/Attachment-2_Full-Application-Template_Senegal-RMNCAH-Activity_English-Version.docx
3 https://westafricatradehub.com/wp-content/uploads/2021/07/Pi%C3%A8ce-Jointe-2_Mod%C3%A8le-de-Demande-Complet_Activit%C3%A9-RMNCAH-S%C3%A9n%C3%A9gal_Version-Fran%C3%A7aise.docx
4 https://westafricatradehub.com/wp-content/uploads/2021/07/Attachment-3_Trade-Hub-Performance-Indicators-Table.xlsx
5 https://westafricatradehub.com/wp-content/uploads/2021/07/Attachment-10_Project-Budget-Template-RMNCAH.xlsx
6 https://westafricatradehub.com/wp-content/uploads/2021/08/Senegal-Health-RFA-Webinar-QA.pdf
7 https://westafricatradehub.com/wp-content/uploads/2021/02/APS-WATIH-1021_Catalytic-Business-Concepts-Round-2.pdf
8 https://westafricatradehub.com/wp-content/uploads/2021/02/APS-WATIH-1021_Concepts-d%E2%80%99Affaires-Catalytiques-2ieme-Tour.pdf
9 https://westafricatradehub.com/wp-content/uploads/2021/06/APS-WATIH-1247_Research-Development-Round-2.pdf
I was able to do this for one link, but not for the whole dataframe:
import urllib.request
import textract

pdf_link = "https://westafricatradehub.com/wp-content/uploads/2021/07/RFA-WATIH-1295_Senegal-RMNCAH-Activity_English-Version.pdf"

def download_file(download_url, filename):
    response = urllib.request.urlopen(download_url)
    file = open(filename + ".pdf", 'wb')
    file.write(response.read())
    file.close()

download_file(pdf_link, "Test")

# Code to extract text from the downloaded PDF
text = textract.process("/Users/fze/Dropbox (LCG Team)/LCG Folder (1)/BD Scan Automation/Python codes/Test.PDF")
print(text)
Thank you!

Here you go:
import urllib.request
import textract

def download_file(download_url, filename):
    response = urllib.request.urlopen(download_url)
    file = open(filename + ".pdf", 'wb')
    file.write(response.read())
    file.close()

df['Text'] = ''
for i in range(df.shape[0]):
    pdf_link = df.iloc[i, 0]
    download_file(pdf_link, f"pdf_{i}")
    text = textract.process(f"/Users/fze/Dropbox (LCG Team)/LCG Folder (1)/BD Scan Automation/Python codes/pdf_{i}.pdf")
    df.loc[i, 'Text'] = text
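Note that the dataframe in the question also contains .docx and .xlsx links, which the loop above would save with a misleading .pdf extension. A slightly more defensive variant (only a sketch, assuming the column is named URL as shown in the question and that files can be written to the current working directory) keeps each file's real extension and only runs textract on actual PDFs:

import os
import urllib.parse
import urllib.request

import textract

# df is assumed to be the dataframe from the question, with a column named 'URL'.
df['Text'] = ''
for i, url in enumerate(df['URL']):
    # Keep the original file name (and extension) from the URL.
    name = os.path.basename(urllib.parse.urlparse(url).path)
    local_path = f"file_{i}_{urllib.parse.unquote(name)}"
    urllib.request.urlretrieve(url, local_path)
    # Only PDFs are passed to textract here; .docx/.xlsx rows keep an empty Text value.
    if local_path.lower().endswith('.pdf'):
        df.loc[i, 'Text'] = textract.process(local_path).decode('utf-8', errors='ignore')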

Related

Unable to convert multiple pages of a PDF file into a CSV using tabula

I have a PDF file whose first page has a different data format, while the rest of the pages share the same tabular format.
I want to convert this multi-page PDF into a CSV file using Python and tabula.
The current code can convert the PDF to CSV only if it has exactly two pages; with more than two pages it raises an out-of-range error.
I want to count the total number of pages in the PDF and, based on that, have the script convert each page into its own dataframe.
I am running this script on a Linux box.
The code is given below:
#!/usr/bin/env python3
import tabula
import pandas as pd
import csv

pdf_file = '/root/scripts/pdf2xls/Test/21KJAZP011.pdf'
column_names = ['Product', 'Batch No', 'Machin No', 'Time', 'Date', 'Drum/Bag No', 'Tare Wt.kg',
                'Gross Wt.kg', 'Net Wt.kg', 'Blender', 'Remarks', 'Operator']
df_results = []  # store results in a list

# Page 1 processing
try:
    df1 = tabula.read_pdf(pdf_file, pages=1, area=(95, 20, 800, 840),  # (top, left, bottom, right)
                          columns=[93, 180, 220, 252, 310, 315, 333, 367, 410, 450, 480, 520],
                          pandas_options={'header': None})
    df1[0] = df1[0].drop(columns=5)
    df1[0].columns = column_names
    df_results.append(df1[0])
    df1[0].head(2)
except Exception as e:
    print(f"Exception page not found {e}")

# Page 2 processing
try:
    df2 = tabula.read_pdf(pdf_file, pages=2, area=(10, 20, 800, 840),  # (top, left, bottom, right)
                          columns=[93, 180, 220, 252, 310, 315, 330, 370, 410, 450, 480, 520],
                          pandas_options={'header': None})
    row_with_Sta = df2[0][df2[0][0] == 'Sta'].index.tolist()[0]
    df2[0] = df2[0].iloc[:row_with_Sta]
    df2[0] = df2[0].drop(columns=5)
    df2[0].columns = column_names
    df_results.append(df2[0])
    df2[0].head(2)
except Exception as e:
    print(f"Exception page not found {e}")

# result = pd.concat([df1[0], df2[0], df3[0]])  # concatenate the pages and then write to CSV
result = pd.concat(df_results)  # concatenate the list of per-page frames, then write to CSV
result.to_csv("result.csv")

with open('/root/scripts/pdf2xls/Test/result.csv', 'r') as f_input, open('/root/scripts/pdf2xls/Test/FinalOutput_21KJAZP011.csv', 'w') as f_output:
    csv_input = csv.reader(f_input)
    csv_output = csv.writer(f_output)
    csv_output.writerow(next(csv_input))  # write header
    for cols in csv_input:
        for i in range(7, 9):
            cols[i] = '{:.2f}'.format(float(cols[i]))
        csv_output.writerow(cols)
Please suggest how I can achieve this. I am very new to Python and hence unable to put these pieces together.
Try pdfplumber (https://github.com/jsvine/pdfplumber); it worked for me like a charm:
import pdfplumber

pdffile = 'your file'
with pdfplumber.open(pdffile) as pdf:
    for i in range(len(pdf.pages)):
        page = pdf.pages[i]
        rawdata = page.extract_table()
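If the goal is a single CSV across all pages, a small continuation of that sketch (reusing the same pdffile variable) would collect the rows instead of overwriting rawdata on every page:

import pandas as pd
import pdfplumber

all_rows = []
with pdfplumber.open(pdffile) as pdf:
    for page in pdf.pages:
        table = page.extract_table()  # None when no table is detected on the page
        if table:
            all_rows.extend(table)

pd.DataFrame(all_rows).to_csv("result.csv", index=False, header=False)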
Alternatively, extract multiple tables from the PDF using tabula's multiple_tables=True option:
from tabula import convert_into

table_file = r"PDF_path"
output_csv = r"out_csv"
# convert_into writes the CSV directly (it returns None)
convert_into(table_file, output_csv, output_format='csv', lattice=False, stream=True, multiple_tables=True, pages="all")
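To tie this back to the original question (page 1 uses a different area than the later pages), one possible approach is to count the pages first and then call tabula once per page, switching the area after page 1. This is only a sketch: it reuses the path, areas and column positions from the question, which may need adjusting for the real file.

import pdfplumber
import tabula
import pandas as pd

pdf_path = '/root/scripts/pdf2xls/Test/21KJAZP011.pdf'

# Count the pages first.
with pdfplumber.open(pdf_path) as pdf:
    num_pages = len(pdf.pages)

frames = []
for page_no in range(1, num_pages + 1):
    # Page 1 has a different layout, so it gets its own area; the rest share one.
    area = (95, 20, 800, 840) if page_no == 1 else (10, 20, 800, 840)
    dfs = tabula.read_pdf(pdf_path, pages=page_no, area=area,
                          columns=[93, 180, 220, 252, 310, 315, 333, 367, 410, 450, 480, 520],
                          pandas_options={'header': None})
    if dfs:
        frames.append(dfs[0])

result = pd.concat(frames)
result.to_csv("result.csv", index=False)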

Read multiple files and save to xls in columns (PyPDF2 and xlsxwriter)

I need to take a directory containing multiple PDFs and structure their contents into an xls file,
but I don't understand how to list the files in the directory and save each one's data to the xls.
import PyPDF2
import xlsxwriter

#---------------------Input file-----------------------------------#
pdf_file = open('arquivo_file', 'rb')
read_pdf = PyPDF2.PdfFileReader(pdf_file)
number_of_pages = read_pdf.getNumPages()
page = read_pdf.getPage(0)
doc = read_pdf.getOutlines
page_content = page.extractText()

text = page_content.replace("\n", " ").replace("\t", " ").replace(" ", "")
content = page_content.split("\n")
data = content[0]

# workbook/worksheet were not defined in the snippet; 'output.xlsx' is a placeholder name
workbook = xlsxwriter.Workbook('output.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(1, 1, data)
workbook.close()
Typically your code would resemble something like this.
import os
import glob

DIRPATH = "/path/to/your/pdf/directory"

# Get list of files with extension .pdf in a given directory
pdf_filepaths = glob.glob(os.path.join(DIRPATH, '*.pdf'))

# Loop over the pdf file-paths
# For each pdf-file:
#   1. read each pdf file
#   2. process the content you read (optional)
#   3. save the processed content to an excel file
for i, pdf_filepath in enumerate(pdf_filepaths):
    content = read_pdf_file(pdf_filepath)
    content = process_data(content)
    write_excel_file(filepath=f'out_{i}.xlsx', content=content)
Here I assumed you would have wrapped your reading, processing and writing logic in three functions:
def read_pdf_file(filepath):
    # your pdf reading logic goes here
    ...
    return content

def process_data(content):
    # your post-reading data-processing logic goes here
    ...
    return content

def write_excel_file(filepath, content):
    # your logic for writing to excel-file goes here
    ...
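For instance, the three stubs could be filled in with PyPDF2 and xlsxwriter, the libraries already used in the question. This is only a sketch, and it uses the newer PyPDF2 API (PdfReader / extract_text) rather than the PdfFileReader calls shown in the question:

import PyPDF2
import xlsxwriter

def read_pdf_file(filepath):
    # Read every page of one PDF and return its text as a single string.
    reader = PyPDF2.PdfReader(filepath)
    return " ".join(page.extract_text() or "" for page in reader.pages)

def process_data(content):
    # Mirror the cleanup from the question: drop newlines and tabs.
    return content.replace("\n", " ").replace("\t", " ")

def write_excel_file(filepath, content):
    # One workbook per PDF, with the text written to cell B2 as in the question.
    workbook = xlsxwriter.Workbook(filepath)
    worksheet = workbook.add_worksheet()
    worksheet.write(1, 1, content)
    workbook.close()

With these in place, the loop above writes one out_0.xlsx, out_1.xlsx, ... for each PDF found in DIRPATH.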

Extract text from multiple .html files and save them in separate txt files

I am trying to extract text content from 24 folders; every folder has several (100+) .html files, and I need to create 24 .txt files to hold the text extracted from each folder's .html files.
I have done most of the job except saving the .txt files: the code I wrote saves 24 files that all have the same content. I know something is wrong in the following part (a possible fix is sketched after the full listing below).
for number in range(1, 25):
    with open('Text' + "%02d" % number + " .txt", "w", encoding='utf-8') as text:
        for i in passage:
            text.write(i + ' ')
The full code is listed below:
# Read files and call functions
from bs4 import BeautifulSoup
import os
import numpy as np

gap_html = os.listdir('gap-html')
print(gap_html)
# print(folder)

passage = list()
# out = "all.txt"

# def Convertfile():
for textFolders in gap_html:
    # domain = os.path.abspath(r'../gap-html')
    folder = os.path.join(os.path.abspath('gap-html'), textFolders)
    # text_folder = os.path.abspath(folder)
    # List the file names under all folders
    textFiles = os.listdir(folder)
    for textFile in textFiles:
        file = os.path.join(os.path.abspath(folder), textFile)
        print(file)
        html = open(file, 'r', encoding="utf-8").read()
        # print("Start reading file...")
        soup = BeautifulSoup(html, features='lxml')
        page = soup.find_all('span', {"class": "ocr_cinfo"})
        for word in page:
            word = word.get_text()
            passage.append(word)

for number in range(1, 25):
    with open('Text' + "%02d" % number + " .txt", "w", encoding='utf-8') as text:
        for i in passage:
            text.write(i + ' ')
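The reason all 24 files end up identical is that passage keeps accumulating across every folder, and the writing loop only runs once after all folders have been read. One possible fix (a sketch reusing the names from the listing above) is to reset passage and write one .txt file per folder inside the outer loop:

from bs4 import BeautifulSoup
import os

gap_html = os.listdir('gap-html')

for number, textFolder in enumerate(gap_html, start=1):
    folder = os.path.join(os.path.abspath('gap-html'), textFolder)
    passage = []  # reset for every folder
    for textFile in os.listdir(folder):
        file = os.path.join(folder, textFile)
        html = open(file, 'r', encoding="utf-8").read()
        soup = BeautifulSoup(html, features='lxml')
        for word in soup.find_all('span', {"class": "ocr_cinfo"}):
            passage.append(word.get_text())
    # one output file per folder: Text01.txt ... Text24.txt
    with open('Text%02d.txt' % number, "w", encoding='utf-8') as text:
        text.write(' '.join(passage))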

I want to open the HTML files in a directory sequentially, parse information using BeautifulSoup, and save it as a CSV file

At first, I tried to open just one file named 'index.html', parse it, and save the result as a CSV file. This was the code, and it worked well:
import csv
from bs4 import BeautifulSoup

with open('/Users/kwon/Downloads/cnn/index.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

cnn_file = open('cnn2.csv', 'w')
cnn_writer = csv.writer(cnn_file)
cnn_writer.writerow(['filename', 'date', 'headline', 'text'])

filename = 'index000'
print(filename)

date = soup.find(class_='update-time').text
date = date.split(' ')[5] + ' ' + date.split(' ')[6] + ' ' + date.split(' ')[7]
print(date)

headline = soup.title.text
headline = headline.split('-')[0]
print(headline)

txt = soup.find(class_="zn zn-body-text zn-body zn--idx-0 zn--ordinary zn-has-multiple-containers zn-has-r'\d*'-containers").text
print(txt)

cnn_writer.writerow([filename, date, headline, txt])
cnn_file.close()
But I want to repeat the same process for all the HTML files (index.html to index591.html) in the directory. So I started by using the glob module to open the files sequentially, then tried a for loop to parse them as I did before. Somehow I don't know how to read and parse them sequentially while naming each filename from 'index000' to 'index591'. Also, if I run the code below I get an error saying 'find() takes no keyword arguments'.
import glob
import bs4

path = '/Users/kwon-yejin/Downloads/cnn2/*.html'
files = glob.glob(path)

for file in files:
    html = open(file, 'r')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    for line in soup:
        filename = 'index000'
        print(filename)
        date = line.find(class_='update-time').text
        date = date.split(' ')[5] + ' ' + date.split(' ')[6] + ' ' + date.split(' ')[7]
        print(date)
        headline = line.title.text
        headline = headline.split('-')[0]
        print(headline)
        txt = line.find(class_="zn zn-body-text zn-body zn--idx-0 zn--ordinary zn-has-multiple-containers zn-has-21-containers").text
        print(txt)
Read and parse them sequentially, building each filename from 'index000' to 'index591':
path = '/Users/kwon-yejin/Downloads/cnn2/'
for i in range(592):
    file = path + 'index' + str(i).zfill(3) + '.html'
    print(file)
/Users/kwon-yejin/Downloads/cnn2/index000.html
/Users/kwon-yejin/Downloads/cnn2/index001.html
/Users/kwon-yejin/Downloads/cnn2/index002.html
/Users/kwon-yejin/Downloads/cnn2/index003.html
..................
/Users/kwon-yejin/Downloads/cnn2/index589.html
/Users/kwon-yejin/Downloads/cnn2/index590.html
/Users/kwon-yejin/Downloads/cnn2/index591.html
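Putting the pieces together, a sketch of the full loop might look like the code below. Two notes: the 'find() takes no keyword arguments' error comes from iterating for line in soup, which yields plain strings (NavigableString) whose find() is the ordinary string method, so the parsing should be done on soup itself; and the class used for the article body is simplified from the question and may need adjusting for the real pages.

import csv
import bs4

path = '/Users/kwon-yejin/Downloads/cnn2/'

with open('cnn2.csv', 'w') as cnn_file:
    cnn_writer = csv.writer(cnn_file)
    cnn_writer.writerow(['filename', 'date', 'headline', 'text'])

    for i in range(592):
        filename = 'index' + str(i).zfill(3)  # index000 ... index591
        with open(path + filename + '.html') as html_file:
            soup = bs4.BeautifulSoup(html_file, 'html.parser')

        date = soup.find(class_='update-time').text
        date = ' '.join(date.split(' ')[5:8])

        headline = soup.title.text.split('-')[0]

        # Simplified selector; the exact class string depends on the saved pages.
        txt = soup.find(class_='zn-body-text').text

        cnn_writer.writerow([filename, date, headline, txt])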

BeautifulSoup - scraping a forum page

I'm trying to scrape a forum discussion and export it as a csv file, with columns such as "thread title", "user", and "post", where the latter is the actual forum post from each individual.
I'm a complete beginner with Python and BeautifulSoup so I'm having a really hard time with this!
My current problem is that all the text is split into one character per row in the csv file. Is there anyone out there who can help me out? It would be fantastic if someone could give me a hand!
Here's the code I've been using:
from bs4 import BeautifulSoup
import csv
import urllib2
f = urllib2.urlopen("https://silkroad5v7dywlc.onion.to/index.php?action=printpage;topic=28536.0")
soup = BeautifulSoup(f)
b = soup.get_text().encode("utf-8").strip() #the posts contain non-ascii words, so I had to do this
writer = csv.writer(open('silkroad.csv', 'w'))
writer.writerows(b)
Ok here we go. Not quite sure what I'm helping you do here, but hopefully you have a good reason to be analyzing silk road posts.
You have a few issues here; the big one is that you aren't parsing the data at all. What you're essentially doing with .get_text() is going to the page, highlighting the whole thing, and then copying and pasting it all into a csv file.
So here is what you should be trying to do:
1. Read the page source
2. Use soup to break it into sections you want
3. Save sections in parallel arrays for author, date, time, post, etc.
4. Write data to csv file row by row
I wrote some code to show you what that looks like, it should do the job:
from bs4 import BeautifulSoup
import csv
import urllib2

# get page source and create a BeautifulSoup object based on it
print "Reading page..."
page = urllib2.urlopen("https://silkroad5v7dywlc.onion.to/index.php?action=printpage;topic=28536.0")
soup = BeautifulSoup(page)

# if you look at the HTML all the titles, dates,
# and authors are stored inside of <dt ...> tags
metaData = soup.find_all("dt")

# likewise the post data is stored
# under <dd ...>
postData = soup.find_all("dd")

# define where we will store info
titles = []
authors = []
times = []
posts = []

# now we iterate through the metaData and parse it
# into titles, authors, and dates
print "Parsing data..."
for html in metaData:
    text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "")  # convert the html to text
    titles.append(text.split("Title:")[1].split("Post by:")[0].strip())  # get Title:
    authors.append(text.split("Post by:")[1].split(" on ")[0].strip())   # get Post by:
    times.append(text.split(" on ")[1].strip())                          # get date

# now we go through the actual post data and extract it
for post in postData:
    posts.append(BeautifulSoup(str(post)).get_text().encode("utf-8").strip())

# now we write data to csv file
# ***csv files MUST be opened with the 'b' flag***
csvfile = open('silkroad.csv', 'wb')
writer = csv.writer(csvfile)

# create template
writer.writerow(["Time", "Author", "Title", "Post"])

# iterate through and write all the data
for time, author, title, post in zip(times, authors, titles, posts):
    writer.writerow([time, author, title, post])

# close file
csvfile.close()

# done
print "Operation completed successfully."
EDIT: Included a solution that can read files from a directory and use the data from them
Okay, so you have your HTML files in a directory. You need to get a list of files in the directory, iterate through them, and append to your csv file for each file in the directory.
This is the basic logic of our new program.
If we had a function called processData() that took a file path as an argument and appended data from the file to your csv file here is what it would look like:
# the directory where we have all our HTML files
dir = "myDir"

# our csv file
csvFile = "silkroad.csv"

# insert the column titles to csv
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Time", "Author", "Title", "Post"])
csvfile.close()

# get a list of files in the directory
fileList = os.listdir(dir)

# define variables we need for status text
totalLen = len(fileList)
count = 1

# iterate through files and read all of them into the csv file
for htmlFile in fileList:
    path = os.path.join(dir, htmlFile)  # get the file path
    processData(path)                   # process the data in the file
    print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..."  # display status
    count = count + 1                   # increment counter
As it happens our processData() function is more or less what we did before, with a few changes.
So this is very similar to our last program, with a few small changes:
- We write the column headers first thing
- Following that we open the csv with the 'ab' flag to append
- We import os to get a list of files
Here's what that looks like:
from bs4 import BeautifulSoup
import csv
import urllib2
import os  # added this import to process files/dirs

# ** define our data processing function
def processData(pageFile):
    ''' take the data from an html file and append to our csv file '''
    f = open(pageFile, "r")
    page = f.read()
    f.close()
    soup = BeautifulSoup(page)

    # if you look at the HTML all the titles, dates,
    # and authors are stored inside of <dt ...> tags
    metaData = soup.find_all("dt")

    # likewise the post data is stored
    # under <dd ...>
    postData = soup.find_all("dd")

    # define where we will store info
    titles = []
    authors = []
    times = []
    posts = []

    # now we iterate through the metaData and parse it
    # into titles, authors, and dates
    for html in metaData:
        text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "")  # convert the html to text
        titles.append(text.split("Title:")[1].split("Post by:")[0].strip())  # get Title:
        authors.append(text.split("Post by:")[1].split(" on ")[0].strip())   # get Post by:
        times.append(text.split(" on ")[1].strip())                          # get date

    # now we go through the actual post data and extract it
    for post in postData:
        posts.append(BeautifulSoup(str(post)).get_text().encode("utf-8").strip())

    # now we write data to csv file
    # ***csv files MUST be opened with the 'b' flag***
    csvfile = open('silkroad.csv', 'ab')
    writer = csv.writer(csvfile)

    # iterate through and write all the data
    for time, author, title, post in zip(times, authors, titles, posts):
        writer.writerow([time, author, title, post])

    # close file
    csvfile.close()
# ** start our process of going through files
# the directory where we have all our HTML files
dir = "myDir"

# our csv file
csvFile = "silkroad.csv"

# insert the column titles to csv
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Time", "Author", "Title", "Post"])
csvfile.close()

# get a list of files in the directory
fileList = os.listdir(dir)

# define variables we need for status text
totalLen = len(fileList)
count = 1

# iterate through files and read all of them into the csv file
for htmlFile in fileList:
    path = os.path.join(dir, htmlFile)  # get the file path
    processData(path)                   # process the data in the file
    print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..."  # display status
    count = count + 1                   # increment counter
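One final caveat: both versions of this answer are written for Python 2 (print statements, urllib2, CSVs opened with the 'b'/'ab' flags). Under Python 3 the equivalent setup would look roughly like this sketch, with urllib.request replacing urllib2 and text-mode CSVs opened with newline='':

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

page = urlopen("https://silkroad5v7dywlc.onion.to/index.php?action=printpage;topic=28536.0")
soup = BeautifulSoup(page, "html.parser")

# Python 3: open CSVs in text mode with newline='' instead of the 'b'/'ab' flags.
with open('silkroad.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Time", "Author", "Title", "Post"])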
