Can't Move / Delete PDF after processing with pdfrw - python

I've updated the question to contain the bulk of the code as I feel there may be some of it that is blocking each other...
Can be tested by simply adding a pdf file or two to your c:\temp folder (on windows).
I've just started with Python so may be missing basic stuff...
import glob
from datetime import datetime
from pathlib import Path
import PyPDF4
from pdfrw import PdfReader, PdfWriter
def safe_open_pdf(pdf):
pdf_reader = None
result = True
file = open(pdf, 'rb')
try:
pdf_reader = PyPDF4.PdfFileReader(file)
result = True
except:
# some older PDF files on my disk raise a missing EOF error, which cannot be handled by PyPDF4
print(pdf.split('\\')[-1] + " needs to be fixed")
result = False
if not result:
# if file had EOF error, I "rebuild" it with PdfReader and PdfWriter
x = PdfReader(pdf)
y = PdfWriter()
y.addpages(x.pages)
y.write(pdf)
pdf_reader = PyPDF4.PdfFileReader(file)
return pdf_reader
def move_processed_pdf(source_file):
Path(new_path).mkdir(parents=True, exist_ok=True)
print("Copying to " + new_path + new_file)
f = open(PDFFile, 'rb')
x = PdfReader(f)
y = PdfWriter()
y.addpages(x.pages)
y.write(new_path + new_file)
f.close()
# time.sleep(5)
Path(PDFFile).unlink()
if __name__ == '__main__':
relevant_path = 'C:\\temp\\'
file_count = 0
new_path = 'C:\\temp\\processed\\'
for PDFFile in glob.iglob(relevant_path + '*.pdf', recursive=True):
new_file = datetime.today().strftime('%Y-%m-%d') + PDFFile.split('\\')[-1]
print('Processing File: ' + PDFFile.split('\\')[-1])
pdfReader = safe_open_pdf(PDFFile)
file_count += 1
num_pages = pdfReader.numPages
print(num_pages)
page_count = 0
text = ''
while page_count < num_pages:
pageObj = pdfReader.getPage(page_count)
page_count += 1
text += pageObj.extractText()
# Main processing occurs here
move_processed_pdf(PDFFile)
the issue I get is PermissionError: [WinError 32] The process cannot access the file because it is being used by another process.
folders and files exist.
any ideas?

Related

reading text from PDF contains unknown encoding

I'm using PyPDF4 to read text from a PDF I downloaded. This works, but the text string is not readable:
ÓŒŁ–Ł#`#䎖Ł#`#Ä›¥–Ž¢–#¥ŒŒŽ—–fi–Ł
Áfi⁄–fl–Ł–#›ŁƒŒŽfl†£›–
As far as I know the file is not encrypted, I can open it in Acrobat Reader without problem. In reader I can also select / copy / paste the text correctly.
for reference: this is the code:
import glob
import PyPDF4
relevant_path = 'C:\\_Personal\\Mega\\PycharmProjects\\PDFHandler\\docs\\input\\'
if __name__ == '__main__':
for PDFFile in glob.iglob(relevant_path + '*.pdf', recursive=True):
print('Processing File: ' + PDFFile.split('\\')[-1])
pdfReader = PyPDF4.PdfFileReader(PDFFile)
num_pages = pdfReader.numPages
print(num_pages)
page_count = 0
text = ''
while page_count < num_pages:
pageObj = pdfReader.getPage(page_count)
page_count += 1
text += pageObj.extractText()
print(text)
any hints? other packages I could use? ...

Python: Extract text from multiple pdf and paste on excel

i'm a total new in python, could you help me correct this code?
I would like to add 2 things:
do the operation on multiple pdf and not just one and pasting the content in A2,A3 A4 and so on
if possible writing in the another row (B2,B3,B4) the name of the pdf file.
Thank you in advance, this is the code i'm working with
import PyPDF2
import openpyxl
pdfFileObj = open("file.pdf", 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pdfReader.numPages
pageObj = pdfReader.getPage(0)
mytext = pageObj.extractText()
wb = openpyxl.load_workbook('excel.xlsx')
sheet = wb.active
sheet.title = 'MyPDF'
sheet['A1'] = mytext
wb.save('excel.xlsx')
print('DONE!!')
I've modified the code as suggested and the cycle seems to get all the pages! but maybe i have to work with "sheet[f'A{row}'].value = '\n'.join(output)" because it seems to print a lot of spaces
import PyPDF2
import openpyxl
import os
import glob
root_dir = "your directory"
filenames = []
# root_dir needs a trailing slash (i.e. /root/dir/)
for filename in glob.iglob(root_dir + '**/**', recursive=True):
if filename.lower().endswith('.pdf'):
filenames.append(os.path.join(directory, filename))
wb = openpyxl.load_workbook('excel.xlsx')#your file excel
sheet = wb.active
sheet.title = 'MyPDF'
for row, filename in enumerate(filenames, start=1):
with open(filename, 'rb') as f:
pdfReader = PyPDF2.PdfFileReader(f)
count=pdfReader.numPages
pageObj = pdfReader.getPage(0)
mytext = pageObj.extractText()
for i in range(count):
page = pdfReader.getPage(i)
output = []
output = page.extractText()
print(output)
sheet[f'A{row}'].value = '\n'.join(output)
sheet[f'B{row}'].value = filename
wb.save('excel.xlsx') #your file excel
print('DONE!!')
You basically want to put the code you wrote which reads the pdf file into a for loop which iterates over the filenames (in this case, the filenames are stored as a tuple).
Using enumerate, row increments every iteration of the loop, and starts at 1. So the text and filename will be put into A1 and B1, then A2 and B2, and so on.
import PyPDF2
import openpyxl
filenames = ("file.pdf",
"file2.pdf",
"file3.pdf",
)
wb = openpyxl.load_workbook('excel.xlsx')
sheet = wb.active
sheet.title = 'MyPDF'
for row, filename in enumerate(filenames, start=1):
with open(filename, 'rb') as f:
pdfReader = PyPDF2.PdfFileReader(f)
pdfReader.numPages
pageObj = pdfReader.getPage(0)
mytext = pageObj.extractText()
sheet[f'A{row}'].value = mytext
sheet[f'B{row}'].value = filename
wb.save('excel.xlsx')
print('DONE!!')
You can get a list of all the filenames ending in .pdf quite easily by iterating over all the files in a directory, and checking if the filename ends in .pdf. If it does, use os.path.join to give you the full filepath, and append it to the filenames list.
You could also use the glob module, too.
import os
filenames = []
directory = r"C:\Stuff\PDF Files"
for filename in os.listdir(directory):
if filename.lower().endswith(".pdf"):
filenames.append(os.path.join(directory, filename))
Updated code:
import PyPDF2
import openpyxl
import os
import glob
import re
import itertools
# Used to strip characters that can't be written to a spreadsheet
# See https://stackoverflow.com/a/93029/3589122
control_chars = ''.join(map(chr, itertools.chain(range(0x00,0x20), range(0x7f,0xa0))))
control_char_re = re.compile('[%s]' % re.escape(control_chars))
def remove_control_chars(s):
return control_char_re.sub('', s)
root_dir = 'your directory' # root_dir needs a trailing slash (i.e. /root/dir/)
filenames = (filename for filename in glob.iglob(root_dir + '/**/*.pdf', recursive=True))
wb = openpyxl.load_workbook('excel.xlsx') # your file excel
sheet = wb.active
sheet.title = 'MyPDF'
row = 1
for filename in filenames:
with open(filename, 'rb') as f:
try:
pdfReader = PyPDF2.PdfFileReader(f)
count = pdfReader.numPages
output = []
for i in range(count):
print(i, filename)
page = pdfReader.getPage(i)
output.append(page.extractText())
#print(output)
except Exception as e:
print(f'Error: PyPDF2 could not read {filename}. Continuing... ({e})')
continue
sheet[f'A{row}'].value = '\n'.join(remove_control_chars(output))
sheet[f'B{row}'].value = filename
row += 1
wb.save('excel.xlsx') #your file excel
print('DONE!!')
have you tried with more than 6/7 files? i get this error with 7 pdf
TypeError Traceback (most recent call last)
<ipython-input-14-07fb0aa603b8> in <module>
23 for i in range(count):
24 page = pdfReader.getPage(i)
---> 25 output.append(page.extractText())
26 print(output)
27
~\anaconda3\lib\site-packages\PyPDF2\_page.py in extractText(self, Tj_sep, TJ_sep)
1283 """
1284 deprecate_with_replacement("extractText", "extract_text")
-> 1285 return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep)
1286
1287 mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
~\anaconda3\lib\site-packages\PyPDF2\_page.py in extract_text(self, Tj_sep, TJ_sep, space_width)
1261 :return: a string object.
1262 """
-> 1263 return self._extract_text(self, self.pdf, space_width, PG.CONTENTS)
1264
1265 def extract_xform_text(
~\anaconda3\lib\site-packages\PyPDF2\_page.py in _extract_text(self, obj, pdf, space_width, content_key)
1243 text = ""
1244 else:
-> 1245 process_operation(operator, operands)
1246 output += text # just in case of
1247 return output
~\anaconda3\lib\site-packages\PyPDF2\_page.py in process_operation(operator, operands)
1195 tm_matrix[5] -= TL
1196 elif operator == b"Tj":
-> 1197 text += operands[0].translate(cmap)
1198 else:
1199 return None
TypeError: a bytes-like object is required, not 'dict'

"[Errno 2] No such file or directory" Issue [duplicate]

This question already has answers here:
Trying to use open(filename, 'w' ) gives IOError: [Errno 2] No such file or directory if directory doesn't exist
(3 answers)
Closed 6 months ago.
So my prof. gave me this code as the solution of my homework but when I run it it gives me an error. Can you please help me out? I guess I didn't specify the location of the file but not sure if that's the case.The objective of this question is to generate and read files that contain a list of random numbers.
import random
import os
import time
def fillFile(fileSize, fileName):
# Delete file if exists
if os.path.exists(fileName):
os.remove(fileName)
# Open file
FILE = open(fileName, "w")
# Write to file
for i in range(fileSize):
r = random.randint(0,fileSize+1000)
FILE.write(str(r) + "\n")
FILE.close()
def readFile(fileName):
# Open file
if os.path.exists(fileName):
FILE = open(fileName,"r")
else:
print(fileName + " does not exist!")
exit()
# Read File
alist = []
for line in FILE:
alist.append(int(line))
FILE.close()
return alist
def mainForFiles():
# Dosyaları oluştur
fileSizes = [1000, 5000, 10000, 25000, 50000, 100000, 200000]
dirName = ".\\filesForAssignment1\\"
# Delete fileStats.txt file if exists
statFileName = "fileStats.txt"
if os.path.exists(statFileName):
os.remove(statFileName)
# open stat file
statFile = open(statFileName, "w")
statFile.write("fillFile")
print("WRITING TO FILES")
for i in fileSizes:
start = time.time()
fillFile(i, dirName+"file"+str(i))
finish = time.time()
statFile.write(" " + str(finish-start))
print("File Size = " + str(i) + " Write Time = " + str(finish-start))
statFile.write("\n")
print("READING FILES")
statFile.write("readFile")
for i in fileSizes:
fileName = dirName+"file"+str(i)
# Dosyayı oku
finish = time.time()
alist = readFile(fileName)
start = time.time()
statFile.write(" " + str(finish-start))
print ("File Size = " + str(i)+ " Dosya Okuma Zamanı = " + str(finish-start))
statFile.write("\n")
statFile.close()
mainForFiles()
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 84, in
<module>
mainForFiles()
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 57, in mainForFiles
fillFile(i, dirName+"file"+str(i))
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 12, in fillFile
FILE = open(fileName, "w")
FileNotFoundError: [Errno 2] No such file or directory: '.\\filesForAssignment1\\file1000'
FileNotFoundError: [Errno 2] No such file or directory: '.\\filesForAssignment1\\file1000'
The w mode causes the file to be created if it doesn't exist (and truncated if it does so the os.remove is not actually useful there), however it does expect intermediate directories to exist.
This means you should ensure the path to the file ('.\\filesForAssignment1) does exist before trying to create the file.
os.makedirs(os.path.dirname(fileName), exists_ok=True)
should do the trick, or
pathlib.Path(fileName).parent.mkdir(parents=True, exists_ok=True)
for a somewhat more modern take on it.
There's a bunch of other minor issues in the script:
the main function should generally be "gated" so modules can be imported without running them
explicitly closing files has fallen out of favor as it's unreliable
when opening files in "text" mode (the default) you should always provide an encoding
pathlib is fun, also that way you should not have to deal with path separators and all that crap
unless it's required to handle that case, I'd just let open(fname, 'r') error out if the file doesn't exist
Here's a version I think should be slightly improved:
import pathlib
import random
import os
import time
def fillFile(fileSize, fileName):
with fileName.open('w', encoding='utf-8') as f:
for i in range(fileSize):
r = random.randint(0,fileSize+1000)
f.write(f"{r}\n")
def readFile(fileName):
with fileName.open(encoding='utf-8') as f:
return [int(line) for line in f]
OUT_DIR = pathlib.Path.cwd().joinpath("filesForAssignment1")
FILE_SIZES = [1000, 5000, 10000, 25000, 50000, 100000, 200000]
def mainForFiles():
# Dosyaları oluştur
OUT_DIR.mkdir(parents=True, exist_ok=True) # make sure the directory exists
statFilePath = pathlib.Path("fileStats.txt")
with statFilePath.open('w', encoding='utf-8') as statFile:
statFile.write("fillFile")
print("WRITING TO FILES")
for i in FILE_SIZES:
start = time.time()
fillFile(i, OUT_DIR.joinpath(f'file{i}'))
finish = time.time()
statFile.write(f" {finish-start}")
print(f"File Size = {i} Write Time = {finish-start})")
statFile.write("\n")
print("READING FILES")
statFile.write("readFile")
for i in FILE_SIZES:
f = OUT_DIR.joinpath(f'file{i}')
# Dosyayı oku
start = time.time()
alist = readFile(f)
finish = time.time()
statFile.write(f" {finish-start}")
print (f"File Size = {i} Dosya Okuma Zamanı = {finish-start}")
statFile.write("\n")
if __name__ == '__main__':
mainForFiles()
exit() is not doing what you want, it continues with the code.
def readFile(fileName):
# Open file
if os.path.exists(fileName):
FILE = open(fileName,"r")
else:
print(fileName + " does not exist!")
return
# Read File
alist = []
for line in FILE:
alist.append(int(line))
FILE.close()
return alist

Export Excel Module via Python

I'm trying to replicate the exporting of a Code Module from an Excel sheet in Python.
The following works in VBA:
Public Sub ExportModules()
Dim wb As Workbook
Set wb = ThisWorkbook
Dim D As String
Dim N
D = ThisWorkbook.Path
For Each VBComp In wb.VBProject.VBComponents
If (VBComp.Type = 1) Then
N = D + "\" + VBComp.Name + ".txt"
VBComp.Export N
End If
Next
End Sub
And I have the following in Python:
import os
import sys
import glob
from win32com.client import Dispatch
scripts_dir = 'folder address'
com_instance = Dispatch("Excel.Application")
com_instance.Visible = False
com_instance.DisplayAlerts = False
for script_file in glob.glob(os.path.join(scripts_dir, "*.xlsm")):
print "Processing: %s" % script_file
(file_path, file_name) = os.path.split(script_file)
objworkbook = com_instance.Workbooks.Open(script_file)
for xlmodule in objworkbook.VBProject.VBComponents:
xlmodule.Export('export file name')
My question is, what do I have to do in Python to replicate the Export of the file as per the VBA code?
Use the default oletools xltrails provides a good way to extract .bas files from .xlsm or other excel files
import os
import shutil
from oletools.olevba3 import VBA_Parser
EXCEL_FILE_EXTENSIONS = ('xlsb', 'xls', 'xlsm', 'xla', 'xlt', 'xlam',)
def parse(workbook_path):
vba_path = workbook_path + '.vba'
vba_parser = VBA_Parser(workbook_path)
vba_modules = vba_parser.extract_all_macros() if vba_parser.detect_vba_macros() else []
for _, _, _, content in vba_modules:
decoded_content = content.decode('latin-1')
lines = []
if '\r\n' in decoded_content:
lines = decoded_content.split('\r\n')
else:
lines = decoded_content.split('\n')
if lines:
name = lines[0].replace('Attribute VB_Name = ', '').strip('"')
content = [line for line in lines[1:] if not (
line.startswith('Attribute') and 'VB_' in line)]
if content and content[-1] == '':
content.pop(len(content)-1)
lines_of_code = len(content)
non_empty_lines_of_code = len([c for c in content if c])
if non_empty_lines_of_code > 0:
if not os.path.exists(os.path.join(vba_path)):
os.makedirs(vba_path)
with open(os.path.join(vba_path, name + '.bas'), 'w') as f:
f.write('\n'.join(content))
if __name__ == '__main__':
for root, dirs, files in os.walk('.'):
for f in dirs:
if f.endswith('.vba'):
shutil.rmtree(os.path.join(root, f))
for f in files:
if f.endswith(EXCEL_FILE_EXTENSIONS):
parse(os.path.join(root, f))
I have tried it and it works great.
Ref: https://www.xltrail.com/blog/auto-export-vba-commit-hook

Having trouble into saving something to a csv file

My program does all that I want, but is not saving the final data to the csv file, I used a print before it to see if the data was right and it is, It is just not writing to the csv file, I'm using 'a' because I don't want it to rewrite what's already written, but it is still returning an error.
here's the part of the code:
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class":"formTable"}):
for row in table.findAll('tr'):
#heading = row.find('td', {"class":"sectionHeading"})
#if heading is not None:
#print(heading.get_text());
#else:
label = row.find('td', {"class":"fieldLabel"})
data = row.find('td', {"class":"fieldData"})
if data is not None and label is not None:
csvline += label.get_text() + "," + data.get_text() + ","
print(csvline)
#csvline.encode('utf-8')
with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(csvline)
Here's the error:
Traceback (most recent call last):
File "C:\PROJECT\pdfs\final.py", line 95, in <module>
with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
TypeError: 'encoding' is an invalid keyword argument for this function
Here's the entire program code in case of need
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
#import unicodecsv as csv
import csv
#import pickle
import requests
from robobrowser import RoboBrowser
import codecs
def rename_files():
file_list = os.listdir(r"C:\\PROJECT\\pdfs")
print(file_list)
saved_path = os.getcwd()
print('Current working directory is '+saved_path)
os.chdir(r'C:\\PROJECT\\pdfs')
for file_name in file_list:
os.rename(file_name, file_name.translate(None, " "))
os.chdir(saved_path)
rename_files()
def run(command):
if platform.system() != 'Windows':
args = shlex.split(command)
else:
args = command
s = subprocess.Popen(args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, errors = s.communicate()
return s.returncode == 0, output, errors
# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
print "%s is not a directory" % base_directory
exit(1)
# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
print "Could not find %s" % bin_path
exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
# If this is not a PDF file
if not file_name.endswith('.pdf'):
# Skip it
continue
file_path = os.path.join(dir_path, file_name)
# Convert your PDF to HTML here
args = (bin_path, file_name, file_path)
success, output, errors = run("python %s -o %s.html %s " %args)
if not success:
print "Could not convert %s to HTML" % file_path
print "%s" % errors
htmls_path = 'C:\\PROJECT'
with open ('score.csv', 'w') as f:
writer = csv.writer(f)
for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
for file_name in file_name_list:
if not file_name.endswith('.html'):
continue
with open(file_name) as markup:
soup = BeautifulSoup(markup.read())
text = soup.get_text()
match = re.findall("PA/(\S*)", text)#To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
print(match)
writer.writerow(match)
for item in match:
data = item.split('/')
case_number = data[0]
case_year = data[1]
csvline = case_number + ","
browser = RoboBrowser()
browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
form = browser.get_forms()[0] # Get the first form on the page
form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
# Use BeautifulSoup to parse this data
answer = browser.response.text
#print(answer)
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class":"formTable"}):
for row in table.findAll('tr'):
#heading = row.find('td', {"class":"sectionHeading"})
#if heading is not None:
#print(heading.get_text());
#else:
label = row.find('td', {"class":"fieldLabel"})
data = row.find('td', {"class":"fieldData"})
if data is not None and label is not None:
csvline += label.get_text() + "," + data.get_text() + ","
print(csvline)
with open ('output_file_two.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow(csvline)
EDIT
It's working, here's the code working
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import unicodecsv as csv
import requests
from robobrowser import RoboBrowser
import codecs
def rename_files():
file_list = os.listdir(r"C:\\PROJECT\\pdfs")
print(file_list)
saved_path = os.getcwd()
print('Current working directory is '+saved_path)
os.chdir(r'C:\\PROJECT\\pdfs')
for file_name in file_list:
os.rename(file_name, file_name.translate(None, " "))
os.chdir(saved_path)
rename_files()
def run(command):
if platform.system() != 'Windows':
args = shlex.split(command)
else:
args = command
s = subprocess.Popen(args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, errors = s.communicate()
return s.returncode == 0, output, errors
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
print "%s is not a directory" % base_directory
exit(1)
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
print "Could not find %s" % bin_path
exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
if not file_name.endswith('.pdf'):
continue
file_path = os.path.join(dir_path, file_name)
args = (bin_path, file_name, file_path)
success, output, errors = run("python %s -o %s.html %s " %args)
if not success:
print "Could not convert %s to HTML" % file_path
print "%s" % errors
htmls_path = 'C:\\PROJECT'
with open ('score.csv', 'w') as f:
writer = csv.writer(f)
for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
for file_name in file_name_list:
if not file_name.endswith('.html'):
continue
with open(file_name) as markup:
soup = BeautifulSoup(markup.read())
text = soup.get_text()
match = re.findall("PA/(\S*)", text)
print(match)
writer.writerow(match)
for item in match:
data = item.split('/')
case_number = data[0]
case_year = data[1]
csvline = case_number + ","
browser = RoboBrowser()
browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
form = browser.get_forms()[0]
form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
answer = browser.response.text
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class":"formTable"}):
for row in table.findAll('tr'):
label = row.find('td', {"class":"fieldLabel"})
data = row.find('td', {"class":"fieldData"})
if data is not None and label is not None:
csvline += label.get_text() + "," + data.get_text() + ","
print(csvline)
my_file = codecs.open('final_output.csv', 'a', 'utf-8')
my_file.write(csvline)
At the end there is a problem with your code
writer = csv.writer(f)
csv.writer(csvline) # here is the problem
See you initialize the writer, but then you don't use it.
writer = csv.writer(f)
writer.writerow(csvline)
Here :
with open ('output_file_two.csv', 'a') as f:
writer = csv.writer(f)
csv.writer (csvline)
You are instanciating a csv.writer, but not using it. This should read:
with open ('output_file_two.csv', 'a') as f:
writer = csv.writer(f)
writer.write(csvline)
Now there are quite a few other problems with your code, the first one being to manually create the 'csvline as text then using csv.writer to store it to file. csv.writer.write() expects a list of rows (tuples) and takes care of properly escaping what needs to be escaped, inserting the proper delimiters etc. It also has a writerow() method that takes a single tuple and so avoid building the whole list in memory FWIW.

Categories

Resources