I'm totally new to Python; could you help me correct this code?
I would like to add two things:
do the operation on multiple PDFs, not just one, pasting the content into A2, A3, A4 and so on;
if possible, writing the name of each PDF file in the next column (B2, B3, B4).
Thank you in advance. This is the code I'm working with:
import PyPDF2
import openpyxl
pdfFileObj = open("file.pdf", 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pdfReader.numPages
pageObj = pdfReader.getPage(0)
mytext = pageObj.extractText()
wb = openpyxl.load_workbook('excel.xlsx')
sheet = wb.active
sheet.title = 'MyPDF'
sheet['A1'] = mytext
wb.save('excel.xlsx')
print('DONE!!')
I've modified the code as suggested and the cycle seems to get all the pages! But maybe I have to work on "sheet[f'A{row}'].value = '\n'.join(output)", because it seems to print a lot of spaces.
import PyPDF2
import openpyxl
import os
import glob

root_dir = "your directory"
filenames = []
# root_dir needs a trailing slash (i.e. /root/dir/)
for filename in glob.iglob(root_dir + '**/**', recursive=True):
    if filename.lower().endswith('.pdf'):
        filenames.append(filename)  # glob already yields the full path

wb = openpyxl.load_workbook('excel.xlsx')  # your excel file
sheet = wb.active
sheet.title = 'MyPDF'
for row, filename in enumerate(filenames, start=1):
    with open(filename, 'rb') as f:
        pdfReader = PyPDF2.PdfFileReader(f)
        count = pdfReader.numPages
        pageObj = pdfReader.getPage(0)
        mytext = pageObj.extractText()
        for i in range(count):
            page = pdfReader.getPage(i)
            output = []
            output = page.extractText()
            print(output)
            sheet[f'A{row}'].value = '\n'.join(output)
            sheet[f'B{row}'].value = filename

wb.save('excel.xlsx')  # your excel file
print('DONE!!')
You basically want to put the code you wrote which reads the pdf file into a for loop which iterates over the filenames (in this case, the filenames are stored as a tuple).
Using enumerate, row increments every iteration of the loop, and starts at 1. So the text and filename will be put into A1 and B1, then A2 and B2, and so on.
import PyPDF2
import openpyxl

filenames = ("file.pdf",
             "file2.pdf",
             "file3.pdf",
             )

wb = openpyxl.load_workbook('excel.xlsx')
sheet = wb.active
sheet.title = 'MyPDF'
for row, filename in enumerate(filenames, start=1):
    with open(filename, 'rb') as f:
        pdfReader = PyPDF2.PdfFileReader(f)
        pageObj = pdfReader.getPage(0)
        mytext = pageObj.extractText()
        sheet[f'A{row}'].value = mytext
        sheet[f'B{row}'].value = filename

wb.save('excel.xlsx')
print('DONE!!')
You can get a list of all the filenames ending in .pdf quite easily by iterating over all the files in a directory, and checking if the filename ends in .pdf. If it does, use os.path.join to give you the full filepath, and append it to the filenames list.
You could also use the glob module; see the sketch after the next snippet.
import os

filenames = []
directory = r"C:\Stuff\PDF Files"
for filename in os.listdir(directory):
    if filename.lower().endswith(".pdf"):
        filenames.append(os.path.join(directory, filename))
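And a roughly equivalent glob version; the recursive '**' pattern is an assumption that PDFs in subfolders should be picked up too:
import glob
import os

directory = r"C:\Stuff\PDF Files"
# recursive=True lets '**' descend into subdirectories
filenames = glob.glob(os.path.join(directory, "**", "*.pdf"), recursive=True)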
Updated code:
import PyPDF2
import openpyxl
import os
import glob
import re
import itertools

# Used to strip characters that can't be written to a spreadsheet
# See https://stackoverflow.com/a/93029/3589122
control_chars = ''.join(map(chr, itertools.chain(range(0x00, 0x20), range(0x7f, 0xa0))))
control_char_re = re.compile('[%s]' % re.escape(control_chars))

def remove_control_chars(s):
    return control_char_re.sub('', s)

root_dir = 'your directory'  # root_dir needs a trailing slash (i.e. /root/dir/)
filenames = (filename for filename in glob.iglob(root_dir + '/**/*.pdf', recursive=True))

wb = openpyxl.load_workbook('excel.xlsx')  # your excel file
sheet = wb.active
sheet.title = 'MyPDF'
row = 1
for filename in filenames:
    with open(filename, 'rb') as f:
        try:
            pdfReader = PyPDF2.PdfFileReader(f)
            count = pdfReader.numPages
            output = []
            for i in range(count):
                print(i, filename)
                page = pdfReader.getPage(i)
                output.append(page.extractText())
            #print(output)
        except Exception as e:
            print(f'Error: PyPDF2 could not read {filename}. Continuing... ({e})')
            continue
    sheet[f'A{row}'].value = remove_control_chars('\n'.join(output))
    sheet[f'B{row}'].value = filename
    row += 1

wb.save('excel.xlsx')  # your excel file
print('DONE!!')
Have you tried it with more than 6 or 7 files? I get this error with 7 PDFs:
TypeError Traceback (most recent call last)
<ipython-input-14-07fb0aa603b8> in <module>
23 for i in range(count):
24 page = pdfReader.getPage(i)
---> 25 output.append(page.extractText())
26 print(output)
27
~\anaconda3\lib\site-packages\PyPDF2\_page.py in extractText(self, Tj_sep, TJ_sep)
1283 """
1284 deprecate_with_replacement("extractText", "extract_text")
-> 1285 return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep)
1286
1287 mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
~\anaconda3\lib\site-packages\PyPDF2\_page.py in extract_text(self, Tj_sep, TJ_sep, space_width)
1261 :return: a string object.
1262 """
-> 1263 return self._extract_text(self, self.pdf, space_width, PG.CONTENTS)
1264
1265 def extract_xform_text(
~\anaconda3\lib\site-packages\PyPDF2\_page.py in _extract_text(self, obj, pdf, space_width, content_key)
1243 text = ""
1244 else:
-> 1245 process_operation(operator, operands)
1246 output += text # just in case of
1247 return output
~\anaconda3\lib\site-packages\PyPDF2\_page.py in process_operation(operator, operands)
1195 tm_matrix[5] -= TL
1196 elif operator == b"Tj":
-> 1197 text += operands[0].translate(cmap)
1198 else:
1199 return None
TypeError: a bytes-like object is required, not 'dict'
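Judging by the traceback, the failure happens inside PyPDF2's own extract_text on one particular PDF, so it is more likely a malformed or unsupported file than the number of files. A hedged workaround is to wrap the per-page extraction in its own try/except so one bad page doesn't abort the whole run (extract_pages_safely is a hypothetical helper, and upgrading to the maintained fork, pypdf, may also handle such files better):
import PyPDF2

def extract_pages_safely(reader, filename):
    """Extract text page by page, skipping pages PyPDF2 cannot parse."""
    output = []
    for i in range(reader.numPages):
        try:
            output.append(reader.getPage(i).extractText())
        except Exception as e:  # e.g. the TypeError above
            print(f'Skipping page {i} of {filename}: {e}')
    return output
Calling output = extract_pages_safely(pdfReader, filename) in place of the inner loop above keeps the rest of the row-writing logic unchanged.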
Related
I am trying to split/extract PDF pages between two strings, excluding the pages that contain the strings themselves.
For example,
String1 = "String1"
String2 = "String2"
If page 2 has "String1" and page 10 has "String2", then the output PDF should contain pages 3 to 9.
The script below extracts every page that contains the string and creates a single PDF with all of those pages.
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import fitz
import re

nameList = list()
directory = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\CLAIMS Analysis\Format 2(Wire)"
output = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\New folder"
for file in os.listdir(directory):
    if not file.endswith(".pdf"):
        continue
    with open(os.path.join(directory, file), 'rb') as pdfFileObj:  # Changes here
        doc = fitz.open(directory + "\\" + file)
        nameList.append(str(file))
        docPageCount = doc.page_count
        reader = PdfReader(pdfFileObj)
        writer = PdfWriter()
        pageNo = list()
        # Open the pdf file
        object = PdfReader(doc.name, 'rb')
        # Get number of pages
        NumPages = docPageCount
        String = "REMIT-TO-CODE"
        # Extract text and do the search
        for i in range(0, NumPages):
            PageObj = object.getPage(i)
            Text = PageObj.extractText()
            if re.search(String, Text):
                ##print("Pattern Found on Page: " + str(i))
                pageNo.append(str(i))
        minPg = min(pageNo, key=float)
        minPg = int(minPg)
        for page_num in range(minPg, docPageCount):
            page = reader.pages[page_num]
            # This is CPU intensive! It ZIPs the contents of the page
            page.compress_content_streams()
            end = output + "\\" + "PAYMENT_WIRE_STEP_1_2_" + file
            writer.add_page(page)
        with open(end, "wb") as fh:
            writer.remove_links()
            writer.write(fh)
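A minimal sketch of the "pages strictly between the two strings" logic, assuming the modern PdfReader/PdfWriter API already imported above and that each marker string occurs on exactly one page:
from PyPDF2 import PdfReader, PdfWriter

def extract_between(src, dst, string1, string2):
    """Write only the pages strictly between the pages containing the two markers."""
    reader = PdfReader(src)
    first = last = None
    for i, page in enumerate(reader.pages):
        text = page.extract_text() or ""
        if first is None and string1 in text:
            first = i                      # page containing String1
        elif first is not None and string2 in text:
            last = i                       # page containing String2
            break
    if first is None or last is None or last - first < 2:
        return False                       # no pages strictly in between
    writer = PdfWriter()
    for i in range(first + 1, last):       # exclude both boundary pages
        writer.add_page(reader.pages[i])
    with open(dst, "wb") as fh:
        writer.write(fh)
    return True
With the example above (String1 on page 2, String2 on page 10, counting from 1), first is 1 and last is 9 in zero-based terms, so pages 3 to 9 end up in the output.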
I've updated the question to contain the bulk of the code, as I feel parts of it may be blocking each other...
It can be tested by simply adding a PDF file or two to your C:\temp folder (on Windows).
I've just started with Python, so I may be missing basic stuff...
import glob
from datetime import datetime
from pathlib import Path
import PyPDF4
from pdfrw import PdfReader, PdfWriter

def safe_open_pdf(pdf):
    pdf_reader = None
    result = True
    file = open(pdf, 'rb')
    try:
        pdf_reader = PyPDF4.PdfFileReader(file)
        result = True
    except:
        # some older PDF files on my disk raise a missing EOF error, which cannot be handled by PyPDF4
        print(pdf.split('\\')[-1] + " needs to be fixed")
        result = False
    if not result:
        # if file had EOF error, I "rebuild" it with PdfReader and PdfWriter
        x = PdfReader(pdf)
        y = PdfWriter()
        y.addpages(x.pages)
        y.write(pdf)
        pdf_reader = PyPDF4.PdfFileReader(file)
    return pdf_reader

def move_processed_pdf(source_file):
    Path(new_path).mkdir(parents=True, exist_ok=True)
    print("Copying to " + new_path + new_file)
    f = open(PDFFile, 'rb')
    x = PdfReader(f)
    y = PdfWriter()
    y.addpages(x.pages)
    y.write(new_path + new_file)
    f.close()
    # time.sleep(5)
    Path(PDFFile).unlink()

if __name__ == '__main__':
    relevant_path = 'C:\\temp\\'
    file_count = 0
    new_path = 'C:\\temp\\processed\\'
    for PDFFile in glob.iglob(relevant_path + '*.pdf', recursive=True):
        new_file = datetime.today().strftime('%Y-%m-%d') + PDFFile.split('\\')[-1]
        print('Processing File: ' + PDFFile.split('\\')[-1])
        pdfReader = safe_open_pdf(PDFFile)
        file_count += 1
        num_pages = pdfReader.numPages
        print(num_pages)
        page_count = 0
        text = ''
        while page_count < num_pages:
            pageObj = pdfReader.getPage(page_count)
            page_count += 1
            text += pageObj.extractText()
        # Main processing occurs here
        move_processed_pdf(PDFFile)
The issue I get is PermissionError: [WinError 32] The process cannot access the file because it is being used by another process.
The folders and files exist.
Any ideas?
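One plausible culprit: safe_open_pdf opens the PDF with open(pdf, 'rb') and the returned reader keeps that handle alive without ever closing it, so Windows still considers the file in use when Path(PDFFile).unlink() runs. A minimal sketch of extracting the text and releasing the handle before the move, using the same PyPDF4 calls as above (read_pdf_text is a hypothetical helper, not part of the original script):
import PyPDF4

def read_pdf_text(pdf_path):
    """Extract all text, closing the file handle before returning."""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF4.PdfFileReader(file)
        text = ''
        for page_num in range(reader.numPages):
            text += reader.getPage(page_num).extractText()
    return text  # the handle is closed here, so unlink()/move can proceed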
So, I want to search my whole directory for files that contain any of a list of regular expressions. That includes directories, PDFs, and CSV files. I can successfully do this when searching only text files, but searching all file types is the struggle. Below is my work so far:
import glob
import re
import PyPDF2

#-------------------------------------------------Input----------------------------------------------------------------------------------------------
folder_path = "/home/"
file_pattern = "/*"
folder_contents = glob.glob(folder_path + file_pattern)

#Search for Emails
regex1 = re.compile(r'\S+#\S+')
#Search for Phone Numbers
regex2 = re.compile(r'\d\d\d[-]\d\d\d[-]\d\d\d\d')
#Search for Locations
regex3 = re.compile("([A-Z]\w+), ([A-Z]{2})")

for file in folder_contents:
    read_file = open(file, 'rt').read()
    if readile_file == pdf:
        pdfFileObj = open('pdf.pdf', 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        pageObj = pdfReader.getPage(0)
        content = pageObj.extractText()
    if regex1.findall(read_file) or regex2.findall(read_file) or regex3.findall(read_file):
        print("YES, This file containts PHI")
        print(file)
    else:
        print("No, This file DOES NOT contain PHI")
        print(file)
When I run this, I get this error:
YES, This file containts PHI
/home/e136320/sample.txt
No, This file DOES NOT contain PHI
/home/e136320/medicalSample.txt
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-129-be0b68229c20> in <module>()
19
20 for file in folder_contents:
---> 21 read_file = open(file, 'rt').read()
22 if readile_file == pdf:
23 # creating a pdf file object
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-128-1537605cf636> in <module>()
18
19 for file in folder_contents:
---> 20 read_file = open(file, 'rt').read()
21 if regex1.findall(read_file) or regex2.findall(read_file) or regex3.findall(read_file):
22 print ("YES, This file containts PHI")
/jupyterhub_env/lib/python3.5/codecs.py in decode(self, input, final)
319 # decode input (taking the buffer into account)
320 data = self.buffer + input
--> 321 (result, consumed) = self._buffer_decode(data, self.errors, final)
322 # keep undecoded input until the next call
323 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc7 in position 10: invalid continuation byte
Any suggestions?
You can't open a PDF file like that; open() in text mode expects a plain text file. You could use something like this:
fn, ext = os.path.splitext(file)
if ext == '.pdf':
    open_function = PyPDF2.PdfFileReader
else:  # plain text
    open_function = open

with open_function(file, 'rt') as open_file:
    # Do something with open file...
This snippet checks the file extension, then assigns an open function depending on what it finds; this is a bit naive and could be done better, in a way similar to the one shown in this answer.
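For instance, a slightly less naive sketch that returns plain text from either branch (note PdfFileReader is not a context manager, so the PDF branch manages its own binary file handle; errors='ignore' is an assumption about how undecodable bytes should be handled):
import os
import PyPDF2

def read_text(path):
    """Return the text content of a .pdf or plain-text file."""
    fn, ext = os.path.splitext(path)
    if ext.lower() == '.pdf':
        with open(path, 'rb') as f:
            reader = PyPDF2.PdfFileReader(f)
            # concatenate the text of every page
            return ''.join(reader.getPage(i).extractText()
                           for i in range(reader.numPages))
    with open(path, 'rt', errors='ignore') as f:  # skip undecodable bytes
        return f.read()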
Your issue occurred because you open different file types the same way; you have to sort them out first. A CSV can be read directly, but a PDF cannot. I used re.search(r".*(?=pdf$)", file) to prevent a file like 2.pdf.csv from being treated as a PDF.
import glob
import re
import PyPDF2

#-------------------------------------------------Input----------------------------------------------------------------------------------------------
folder_path = "/home/e136320/"
file_pattern = "/*"
folder_contents = glob.glob(folder_path + file_pattern)

#Search for Emails
regex1 = re.compile(r'\S+#\S+')
#Search for Phone Numbers
regex2 = re.compile(r'\d\d\d[-]\d\d\d[-]\d\d\d\d')
#Search for Locations
regex3 = re.compile("([A-Z]\w+), ([A-Z]{2})")

for file in folder_contents:
    if re.search(r".*(?=pdf$)", file):
        #this is pdf
        with open(file, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            pageObj = pdfReader.getPage(0)
            content = pageObj.extractText()
            read_file = content
    elif re.search(r".*(?=csv$)", file):
        #this is csv
        with open(file, "r+", encoding="utf-8") as csv:
            read_file = csv.read()
    else:
        #print("{}".format(file))
        continue
    if regex1.findall(read_file) or regex2.findall(read_file) or regex3.findall(read_file):
        print("YES, This file containts PHI")
        print(file)
    else:
        print("No, This file DOES NOT contain PHI")
        print(file)
I am parsing through many XML files and putting certain information into a CSV file. Because my XML files are named "1.xml", "2.xml", etc., I am using a for loop to cycle through the file names. However, depending on the range I use in the for loop, my CSV file contains different data. For example, when the range is 1:200, the CSV includes info from XML files 1 to 199; but when I change the range to 1:300, the CSV only contains info from XML files 217 to 249. The data actually stored in my CSV file changes based on the range of the for loop. Has anyone else had this error, and do you have any solutions?
My code is below:
import xml.etree.ElementTree as ET
import csv
from pathlib import Path
# open a file for writing
data_labels = open('DataLabels.csv', 'w', newline='')
missing_files = open('MissingFiles.csv', 'w', newline = '')
# create the csv writer object
csvwriter = csv.writer(data_labels)
csvwriter2 = csv.writer(missing_files)
data_head = []
data = []
missingfiles = 0
missfiles = []
MediaId = "Media Id"
#data_head.append (MediaId)
Family = "Family"
#data_head.append (Family)
Species = "Species"
#data_head.append (Species)
Genus = "Genus"
Content = "Content"
ClassId = "ClassId"
#data_head.append (Genus)
data_head.append(MediaId)
# Family = member.find('Family').tag
data_head.append(Content)
data_head.append(ClassId)
data_head.append(Family)
# Species = member.find('Species').tag
data_head.append(Species)
# Genus = member.find('Genus').tag
data_head.append(Genus)
csvwriter.writerow(data_head)
for i in range(1, 190):
    #print (i)
    data = []
    inputfilename = str(i) + ".xml"
    my_file = Path(inputfilename)
    if my_file.is_file():
        data_labels = open('DataLabels.csv', 'w', newline='')
        tree = ET.parse(inputfilename)
        root = tree.getroot()
        MediaId = root[2].text
        Content = root[4].text
        ClassId = root[5].text
        Family = root[6].text
        Species = root[7].text
        Genus = root[8].text
        #print (vote)
        #count = 0
        #for Image in root.find('MediaId'):
        #print (child.tag, child.attrib)
        #name = child.find('MediaId').text
        # print (Image.find ('MediaId').text)
        ##csvwriter.writerow (data_head)
        #data = []
        #if count == 0:
        # print ("count is zero i'm in loop")
        # MediaId = member.find('MediaId').tag
        # count = count + 1
        #else:
        #MediaId = root.findall('MediaId').text
        data.append(MediaId)
        data.append(Content)
        data.append(ClassId)
        #Family = member.find('Family').text
        data.append(Family)
        #Species = member.find('Species').text
        data.append(Species)
        #Genus = member.find('Genus').text
        data.append(Genus)
        csvwriter.writerow(data)
        data_labels.close()
        #print (data)
    else:
        missingfiles = missingfiles + 1
        missfiles = []
        missfiles.append(inputfilename)
        csvwriter2.writerow(missfiles)

print("missing", missingfiles, "files")
data_labels.close()
missing_files.close()
print("done")
Open the CSV in append mode; otherwise you are just overwriting the same file.
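Concretely (the question's loop reopens DataLabels.csv with 'w' on every iteration, which truncates it each time):
# 'w' truncates the file every time it is opened
data_labels = open('DataLabels.csv', 'w', newline='')
# 'a' keeps the existing contents and appends new rows
data_labels = open('DataLabels.csv', 'a', newline='')
Since the script already opens the file once at the top, the simplest fix is to delete the inner open() altogether.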
I think you need to divide your script into small, readable functions.
First, you can create a function to parse an XML file:
import xml.etree.ElementTree as ET

def parse_xml_file(xml_path):
    """ Parse an XML file and return the data. """
    # type: (str) -> list
    tree = ET.parse(xml_path)
    root = tree.getroot()
    return [
        root[2].text,
        root[4].text,
        root[5].text,
        root[6].text,
        root[7].text,
        root[8].text]
This function parses an XML file and returns one record containing a list of values.
Then, you can create a function to iterate over a list of XML files (the existing files) and populate the CSV file:
import csv
import io
import os

def populate_data_labels(xml_path_list, work_dir="."):
    # column order matches what parse_xml_file() returns
    header = ["Media Id", "Content", "ClassId", "Family", "Species", "Genus"]
    with io.open(os.path.join(work_dir, 'DataLabels.csv'), 'w') as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        for xml_path in xml_path_list:
            writer.writerow(parse_xml_file(xml_path))
This function uses parse_xml_file() to extract each record.
You can also create a function to log the missing files. You can use CSV format (or a simple text file):
def populate_missing_files(missing_files, work_dir="."):
    header = ["Filename"]
    with io.open(os.path.join(work_dir, 'MissingFiles.csv'), 'w') as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        for xml_path in missing_files:
            writer.writerow([os.path.basename(xml_path)])
Finally, you can write a function which searches for the XML files and calls the previous functions:
def parse_work_dir(work_dir="."):
    all_files = [os.path.join(work_dir, "{0}.xml".format(idx))
                 for idx in range(1, 190)]
    existing_files = (path for path in all_files if os.path.exists(path))
    populate_data_labels(existing_files, work_dir)
    missing_files = (path for path in all_files if not os.path.exists(path))
    populate_missing_files(missing_files, work_dir)
Usage:
parse_work_dir("/path/to/your/working/dir")
My program does everything I want, except that it is not saving the final data to the CSV file. I used a print before it to check that the data is right, and it is; it's just not writing to the CSV file. I'm using 'a' because I don't want it to overwrite what's already written, but it still returns an error.
here's the part of the code:
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class": "formTable"}):
    for row in table.findAll('tr'):
        #heading = row.find('td', {"class":"sectionHeading"})
        #if heading is not None:
        #print(heading.get_text());
        #else:
        label = row.find('td', {"class": "fieldLabel"})
        data = row.find('td', {"class": "fieldData"})
        if data is not None and label is not None:
            csvline += label.get_text() + "," + data.get_text() + ","

print(csvline)
#csvline.encode('utf-8')
with open('output_file_two.csv', 'a', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(csvline)
Here's the error:
Traceback (most recent call last):
File "C:\PROJECT\pdfs\final.py", line 95, in <module>
with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
TypeError: 'encoding' is an invalid keyword argument for this function
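The traceback gives the cause away: the print statements elsewhere in the program show this is Python 2, and Python 2's built-in open() does not accept an encoding argument (that keyword arrived with Python 3). On Python 2 the usual substitutes are codecs.open or io.open, which is what the EDIT below ends up using:
import codecs

# codecs.open accepts an encoding on Python 2, unlike the builtin open()
with codecs.open('output_file_two.csv', 'a', encoding='utf-8') as f:
    f.write(csvline)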
Here's the entire program code, in case it's needed:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
#import unicodecsv as csv
import csv
#import pickle
import requests
from robobrowser import RoboBrowser
import codecs

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is ' + saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)

rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " % args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors

htmls_path = 'C:\\PROJECT'
with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                # To remove the names that appear, just remove the last (\S*); to add them, just add the (\S*); before it there was a \s*
                match = re.findall("PA/(\S*)", text)
                print(match)
                writer.writerow(match)

for item in match:
    data = item.split('/')
    case_number = data[0]
    case_year = data[1]
    csvline = case_number + ","
    browser = RoboBrowser()
    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
    form = browser.get_forms()[0]  # Get the first form on the page
    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
    # Use BeautifulSoup to parse this data
    answer = browser.response.text
    #print(answer)
    soup = BeautifulSoup(answer)
    for table in soup.findAll('table', {"class": "formTable"}):
        for row in table.findAll('tr'):
            #heading = row.find('td', {"class":"sectionHeading"})
            #if heading is not None:
            #print(heading.get_text());
            #else:
            label = row.find('td', {"class": "fieldLabel"})
            data = row.find('td', {"class": "fieldData"})
            if data is not None and label is not None:
                csvline += label.get_text() + "," + data.get_text() + ","
    print(csvline)
    with open('output_file_two.csv', 'a') as f:
        writer = csv.writer(f)
        writer.writerow(csvline)
EDIT: It's working. Here's the working code:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import unicodecsv as csv
import requests
from robobrowser import RoboBrowser
import codecs

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is ' + saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)

rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        if not file_name.endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " % args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors

htmls_path = 'C:\\PROJECT'
with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                match = re.findall("PA/(\S*)", text)
                print(match)
                writer.writerow(match)

for item in match:
    data = item.split('/')
    case_number = data[0]
    case_year = data[1]
    csvline = case_number + ","
    browser = RoboBrowser()
    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
    form = browser.get_forms()[0]
    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
    answer = browser.response.text
    soup = BeautifulSoup(answer)
    for table in soup.findAll('table', {"class": "formTable"}):
        for row in table.findAll('tr'):
            label = row.find('td', {"class": "fieldLabel"})
            data = row.find('td', {"class": "fieldData"})
            if data is not None and label is not None:
                csvline += label.get_text() + "," + data.get_text() + ","
    print(csvline)
    my_file = codecs.open('final_output.csv', 'a', 'utf-8')
    my_file.write(csvline)
At the end there is a problem with your code:
writer = csv.writer(f)
csv.writer(csvline)  # here is the problem
See, you initialize the writer, but then you don't use it. It should be:
writer = csv.writer(f)
writer.writerow(csvline)
Here:
with open('output_file_two.csv', 'a') as f:
    writer = csv.writer(f)
    csv.writer(csvline)
You are instantiating a csv.writer, but not using it. This should read:
with open('output_file_two.csv', 'a') as f:
    writer = csv.writer(f)
    writer.writerow(csvline)
Now there are quite a few other problems with your code, the first one being manually building csvline as text and then using csv.writer to store it to file. csv.writer's writerows() method expects a list of rows (tuples) and takes care of properly escaping what needs to be escaped, inserting the proper delimiters, etc. It also has a writerow() method that takes a single tuple, and so avoids building the whole list in memory, FWIW.
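To illustrate: a string is itself a sequence of characters, so passing csvline straight to writerow splits it into one column per character, while passing a list gives one column per item:
import csv, sys

writer = csv.writer(sys.stdout)
writer.writerow("abc")            # writes: a,b,c  (one column per character)
writer.writerow(["abc", "def"])   # writes: abc,def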