Merging PDFs while retaining custom page numbers (aka pagelabels) and bookmarks - python

I'm trying to automate merging several PDF files and have two requirements: a) existing bookmarks AND b) pagelabels (custom page numbering) need to be retained.
Retaining bookmarks when merging happens by default with PyPDF2 and pdftk, but not with pdfrw.
Pagelabels are consistently not retained in PyPDF2, pdftk or pdfrw.
I am guessing, after having searched a lot, that there is no straightforward approach to doing what I want. If I'm wrong then I hope someone can point to this easy solution. But, if there is no easy solution, any tips on how to get this going in python will be much appreciated!
Some example code:
1) With PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileMerger, PdfFileReader
tmp1 = PdfFileReader('file1.pdf', 'rb')
tmp2 = PdfFileReader('file2.pdf', 'rb')
#extracting pagelabels is easy
pl1 = tmp1.trailer['/Root']['/PageLabels']
pl2 = tmp2.trailer['/Root']['/PageLabels']
#but PdfFileWriter or PdfFileMerger does not support writing from what I understand
So I dont know how to proceed from here
2) With pdfrw (has more promise)
from pdfrw import PdfReader, PdfWriter
writer = PdfWriter()
#read 1st file
tmp1 = PdfReader('file1')
#add the pages
writer.addpages(tmp1.pages)
#copy bookmarks to writer
writer.trailer.Root.Outlines = tmp1.Root.Outlines
#copy pagelabels to writer
writer.trailer.Root.PageLabels = tmp1.Root.PageLabels
#read second file
tmp2 = PdfReader('file2')
#append pages
writer.addpages(tmp2.pages)
# so far so good
Page numbers of bookmarks from 2nd file need to be offset before adding them, but when reading outlines I almost always get (IndirectObject, XXX) instead of page numbers. Its unclear how to get page numbers for each label and bookmark using pdfrw. So, I'm stuck again
zp

As mentioned in my comment, I'm posting a generic solution to merge several pdfs that works in PyPDF2. Dont know what is different to make this work in PyPDF2 other than initializing pls as ArrayObject()
from PyPDF2 import PdfFileWriter, PdfFileMerger, PdfFileReader
import PyPDF2.pdf as PDF
# pls holds all the pagelabels as we iterate through multiple pdfs
pls = PDF.ArrayObject()
# used to offset bookmarks
pageCount = 0
cpdf = PdfFileMerger()
# pdffiles is a list of all files to be merged
for i in range(len(pdffiles)):
tmppdf = PdfFileReader(pdffiles[i], 'rb')
cpdf.append(tmppdf)
# copy all the pagelabels which I assume is present in all files
# you could use 'try' in case no pagelabels are present
plstmp = tmppdf.trailer['/Root']['/PageLabels']['/Nums']
# sometimes keys are indirect objects
# so, iterate through each pagelabel and...
for j in range(len(plstmp)):
# ... get the actual values
plstmp[j] = plstmp[j].getObject()
# offset pagenumbers by current count of pages
if isinstance(plstmp[j], int):
plstmp[j] = PDF.NumberObject(plstmp[j] + pageCount)
# once all the pagelabels are processed I append to pls
pls += plstmp
#increment pageCount
pageCount += tmppdf.getNumPages()
# rest follows KevinM's answer
pagenums = PDF.DictionaryObject()
pagenums.update({PDF.NameObject('/Nums') : pls})
pagelabels = PDF.DictionaryObject()
pagelabels.update({PDF.NameObject('/PageLabels') : pagenums})
cpdf.output._root_object.update(pagelabels)
cpdf.write("filename.pdf")

You need to iterate through the existing PageLabels and add them to the merged output, taking care to add an offset to the page index entry, based on the number of pages already added.
This solution also requires PyPDF4, since PyPDF2 produces a weird error (see bottom).
from PyPDF4 import PdfFileWriter, PdfFileMerger, PdfFileReader
# To manipulate the PDF dictionary
import PyPDF4.pdf as PDF
import logging
def add_nums(num_entry, page_offset, nums_array):
for num in num_entry['/Nums']:
if isinstance(num, (int)):
logging.debug("Found page number %s, offset %s: ", num, page_offset)
# Add the physical page information
nums_array.append(PDF.NumberObject(num+page_offset))
else:
# {'/S': '/r'}, or {'/S': '/D', '/St': 489}
keys = num.keys()
logging.debug("Found page label, keys: %s", keys)
number_type = PDF.DictionaryObject()
# Always copy the /S entry
s_entry = num['/S']
number_type.update({PDF.NameObject("/S"): PDF.NameObject(s_entry)})
logging.debug("Adding /S entry: %s", s_entry)
if '/St' in keys:
# If there is an /St entry, fetch it
pdf_label_offset = num['/St']
# and add the new offset to it
logging.debug("Found /St %s", pdf_label_offset)
number_type.update({PDF.NameObject("/St"): PDF.NumberObject(pdf_label_offset)})
# Add the label information
nums_array.append(number_type)
return nums_array
def write_merged(pdf_readers):
# Output
merger = PdfFileMerger()
# For PageLabels information
page_labels = []
page_offset = 0
nums_array = PDF.ArrayObject()
# Iterate through all the inputs
for pdf_reader in pdf_readers:
try:
# Merge the content
merger.append(pdf_reader)
# Handle the PageLabels
# Fetch page information
old_page_labels = pdf_reader.trailer['/Root']['/PageLabels']
page_count = pdf_reader.getNumPages()
# Add PageLabel information
add_nums(old_page_labels, page_offset, nums_array)
page_offset = page_offset + page_count
except Exception as err:
print("ERROR: %s" % err)
# Add PageLabels
page_numbers = PDF.DictionaryObject()
page_numbers.update({PDF.NameObject("/Nums"): nums_array})
page_labels = PDF.DictionaryObject()
page_labels.update({PDF.NameObject("/PageLabels"): page_numbers})
root_obj = merger.output._root_object
root_obj.update(page_labels)
# Write output
merger.write('merged.pdf')
pdf_readers = []
tmp1 = PdfFileReader('file1.pdf', 'rb')
tmp2 = PdfFileReader('file2.pdf', 'rb')
pdf_readers.append(tmp1)
pdf_readers.append(tmp2)
write_merged(pdf_readers)
Note: PyPDF2 produces this weird error:
...
...
File "/usr/lib/python3/dist-packages/PyPDF2/pdf.py", line 552, in _sweepIndirectReferences
data[key] = value
File "/usr/lib/python3/dist-packages/PyPDF2/generic.py", line 507, in __setitem__
raise ValueError("key must be PdfObject")
ValueError: key must be PdfObject

Related

problem duplicating first pdf when combing multiple fillable pdf files with python

I'm trying to combine 40+ fillable pdf into one pdf. Each pdf has one page and they are the same form with different data. I followed the script from (PyPDF2 PdfFileMerger loosing PDF module in merged file) to merge pdf, but the merged output duplicates the first file 45 times (for 45 files). Does anyone know what the issue could be and what is the solution? Thank you in advance!
first i define a function to fill forms:
def set_need_appearances_writer(writer):
try:
catalog = writer._root_object
# get the AcroForm tree and add "/NeedAppearances attribute
if "/AcroForm" not in catalog:
writer._root_object.update({
NameObject("/AcroForm"): IndirectObject(len(writer._objects), 0, writer)})
need_appearances = NameObject("/NeedAppearances")
writer._root_object["/AcroForm"][need_appearances] = BooleanObject(True)
except Exception as e:
print('set_need_appearances_writer() catch : ', repr(e))
return writer
def AYX_PDF_form_fill(i, template, outfile):
try:
field_dictionary = df.to_dict('records')[i]
inputStream = open(template, "rb")
pdf_reader = PdfFileReader(inputStream, strict=False)
pdf_writer = PdfFileWriter()
set_need_appearances_writer(pdf_writer)
pdf_writer.addPage(pdf_reader.getPage(0))
pdf_writer.updatePageFormFieldValues(pdf_writer.getPage(0), field_dictionary)
#set_annotation_flag_writer(pdf_writer, field_dictionary)
outputStream = open(outfile, "wb")
#pdf_writer.encrypt(userPWD, ownerPWD, use_128bit=True)
pdf_writer.write(outputStream)
inputStream.close()
outputStream.close()
except Exception as e:
print('AYX_PDF_form_fill() catch : ', repr(e))
return
Then I call the function to fill forms:
template = 'input/template.pdf'
for i in range(0,len(df)):
outfile = os.path.join('output/pdf',"%i.pdf" % i)
AYX_PDF_form_fill(i, template, outfile)
next I define a function to merge pdfs:
from pdfrw import PdfReader, PdfWriter, PdfName
def merge_pdf_files_pdfrw(pdf_files, output_filename):
output = PdfWriter()
num = 0
output_acroform = None
for pdf in pdf_files:
input = PdfReader(pdf,verbose=False)
output.addpages(input.pages)
if PdfName('AcroForm') in input[PdfName('Root')].keys(): # Not all PDFs have an AcroForm node
source_acroform = input[PdfName('Root')][PdfName('AcroForm')]
if PdfName('Fields') in source_acroform:
output_formfields = source_acroform[PdfName('Fields')]
else:
output_formfields = []
num2 = 0
for form_field in output_formfields:
key = PdfName('T')
old_name = form_field[key].replace('(','').replace(')','') # Field names are in the "(name)" format
form_field[key] = 'FILE_{n}_FIELD_{m}_{on}'.format(n=num, m=num2, on=old_name)
num2 += 1
if output_acroform == None:
# copy the first AcroForm node
output_acroform = source_acroform
else:
for key in source_acroform.keys():
# Add new AcroForms keys if output_acroform already existing
if key not in output_acroform:
output_acroform[key] = source_acroform[key]
# Add missing font entries in /DR node of source file
if (PdfName('DR') in source_acroform.keys()) and (PdfName('Font') in source_acroform[PdfName('DR')].keys()):
if PdfName('Font') not in output_acroform[PdfName('DR')].keys():
# if output_acroform is missing entirely the /Font node under an existing /DR, simply add it
output_acroform[PdfName('DR')][PdfName('Font')] = source_acroform[PdfName('DR')][PdfName('Font')]
else:
# else add new fonts only
for font_key in source_acroform[PdfName('DR')][PdfName('Font')].keys():
if font_key not in output_acroform[PdfName('DR')][PdfName('Font')]:
output_acroform[PdfName('DR')][PdfName('Font')][font_key] = source_acroform[PdfName('DR')][PdfName('Font')][font_key]
if PdfName('Fields') not in output_acroform:
output_acroform[PdfName('Fields')] = output_formfields
else:
# Add new fields
output_acroform[PdfName('Fields')] += output_formfields
num +=1
output.trailer[PdfName('Root')][PdfName('AcroForm')] = output_acroform
output.write(output_filename)
then I call the function.
# the list contains 45 files
pdf_files=['output/pdf/sample1.pdf','output/pdf/sample2.pdf',....]
merged='output/pdf/merged.pdf'
merge_pdf_files_pdfrw(pdf_files,merged)
the merged output file has 45 pages duplicating the information from the first file.

Get PDF attachments using Python

I admit that I am new to Python.
We have to process PDF files with attachments or annotated attachments. I am trying to extract attachments from a PDF file using PyPDF2 library.
The only (!) example found on GitHub contains the following code:
import PyPDF2
def getAttachments(reader):
catalog = reader.trailer["/Root"]
# VK
print (catalog)
#
fileNames = catalog['/Names']['/EmbeddedFiles']['/Names']
And the call is:
rootdir = "C:/Users/***.pdf" # My file path
handler = open(rootdir, 'rb')
reader = PyPDF2.PdfFileReader(handler)
dictionary = getAttachments(reader)
I am getting a KeyError: '/EmbeddedFiles'
A print of the catalog indeed does not contain EmbeddedFiles:
{'/Extensions': {'/ADBE': {'/BaseVersion': '/1.7', '/ExtensionLevel': 3}}, '/Metadata': IndirectObject(2, 0), '/Names': IndirectObject(5, 0), '/OpenAction': IndirectObject(6, 0), '/PageLayout': '/OneColumn', '/Pages': IndirectObject(3, 0), '/PieceInfo': IndirectObject(7, 0), '/Type': '/Catalog'}
This particular PDF contains 9 attachments. How can I get them?
Too Long for comments, and I have not tested personally this code, which looks very similar to your outline in the question, however I am adding here for others to test. It is the subject of a Pull Request https://github.com/mstamy2/PyPDF2/pull/440 and here is the full updated sequence as described by Kevin M Loeffler in https://kevinmloeffler.com/2018/07/08/how-to-extract-pdf-file-attachments-using-python-and-pypdf2/
Viewable at https://gist.github.com/kevinl95/29a9e18d474eb6e23372074deff2df38#file-extract_pdf_attachments-py
Download as
https://gist.github.com/kevinl95/29a9e18d474eb6e23372074deff2df38/raw/acdc194058f9fa2c4d2619a4c623d0efeec32555/extract_pdf_attachments.py
It always helps if you can provide an example input of the type you have problems with so that others can adapt the extraction routine to suit.
In response to getting an error
"I’m guessing the script is breaking because the embedded files section of the PDF doesn’t always exist so trying to access it throws an error."
"Something I would try is to put everything after the ‘catalog’ line in the get_attachments method in a try-catch."
Unfortunately there are many pending pull requests not included into PyPDF2 https://github.com/mstamy2/PyPDF2/pulls and others may also be relevant or needed to aid with this and other shortcomings. Thus you need to see if any of those may also help.
For one pending example of a try catch that you might be able to include / and adapt for other use cases see https://github.com/mstamy2/PyPDF2/pull/551/commits/9d52ef517319b538f007669631ba6b778f8ec3a3
Associated keywords for imbedded files apart from /Type/EmbeddedFiles include /Type /Filespec & /Subtype /FileAttachment note the pairs may not always have spaces so perhaps see if those can be interrogated for the attachments
Again on that last point the example searches for /EmbeddedFiles as indexed in the plural whilst any individual entry itself is identified as singular
This can be improved but it was tested to work (using PyMuPDF).
It detects corrupted PDF files, encryption, attachments, annotations and portfolios.
I am yet to compare the output with our internal classification.
Produces a semicolon separated file that can be imported into Excel.
import fitz # = PyMuPDF
import os
outfile = open("C:/Users/me/Downloads/testPDF3.txt", "w", encoding="utf-8")
folder = "C:/Users/me/Downloads"
print ("filepath;","encrypted;","pages;", "embedded;","attachments;","annotations;","portfolio", file = outfile)
enc=pages=count=names=annots=collection=''
for subdir, dirs, files in os.walk(folder):
for file in files:
#print (os.path.join(subdir, file))
filepath = subdir + os.sep + file
if filepath.endswith(".pdf"):
#print (filepath, file = outfile)
try:
doc = fitz.open(filepath)
enc = doc.is_encrypted
#print("Encrypted? ", enc, file = outfile)
pages = doc.page_count
#print("Number of pages: ", pages, file = outfile)
count = doc.embfile_count()
#print("Number of embedded files:", count, file = outfile) # shows number of embedded files
names = doc.embfile_names()
#print("Embedded files:", str(names), file = outfile)
#if count > 0:
# for emb in names:
# print(doc.embfile_info(emb), file = outfile)
annots = doc.has_annots()
#print("Has annots?", annots, file = outfile)
links = doc.has_links()
#print("Has links?", links, file = outfile)
trailer = doc.pdf_trailer()
#print("Trailer: ", trailer, file = outfile)
xreflen = doc.xref_length() # length of objects table
for xref in range(1, xreflen): # skip item 0!
#print("", file = outfile)
#print("object %i (stream: %s)" % (xref, doc.is_stream(xref)), file = outfile)
#print(doc.xref_object(i, compressed=False), file = outfile)
if "Collection" in doc.xref_object(xref, compressed=False):
#print ("Portfolio", file = outfile)
collection ='True'
break
else: collection="False"
#print(doc.xref_object(xref, compressed=False), file = outfile)
except:
#print ("Not a valid PDF", file = outfile)
enc=pages=count=names=annots=collection="Not a valid PDF"
print(filepath,";", enc,";",pages, ";",count, ";",names, ";",annots, ";",collection, file = outfile )
outfile.close()
I was also running into the same problem with several pdfs that I have. I was able to make these changes to the referenced code that got it to work for me:
import PyPDF2
def getAttachments(reader):
"""
Retrieves the file attachments of the PDF as a dictionary of file names
and the file data as a bytestring.
:return: dictionary of filenames and bytestrings
"""
attachments = {}
#First, get those that are pdf attachments
catalog = reader.trailer["/Root"]
if "/EmbeddedFiles" in catalog["/Names"]:
fileNames = catalog['/Names']['/EmbeddedFiles']['/Names']
for f in fileNames:
if isinstance(f, str):
name = f
dataIndex = fileNames.index(f) + 1
fDict = fileNames[dataIndex].getObject()
fData = fDict['/EF']['/F'].getData()
attachments[name] = fData
#Next, go through all pages and all annotations to those pages
#to find any attached files
for pagenum in range(0, reader.getNumPages()):
page_object = reader.getPage(pagenum)
if "/Annots" in page_object:
for annot in page_object['/Annots']:
annotobj = annot.getObject()
if annotobj['/Subtype'] == '/FileAttachment':
fileobj = annotobj["/FS"]
attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].getData()
return attachments
handler = open(filename, 'rb')
reader = PyPDF2.PdfFileReader(handler)
dictionary = getAttachments(reader)
for fName, fData in dictionary.items():
with open(fName, 'wb') as outfile:
outfile.write(fData)
I know it is a late reply, but i only started looking into this yesterday. I have used the PyMuPdf library to extract the embedded files. here is my code:
import os
import fitz
def get_embedded_pdfs(input_pdf_path, output_path=None):
input_path = "/".join(input_pdf_path.split('/')[:-1])
if not output_path :
output_path = input_pdf_path.split(".")[0] + "_embeded_files/"
if output_path not in os.listdir(input_path):
os.mkdir(output_path)
doc = fitz.open(input_pdf_path)
item_name_dict = {}
for each_item in doc.embfile_names():
item_name_dict[each_item] = doc.embfile_info(each_item)["filename"]
for item_name, file_name in item_name_dict.items():
out_pdf = output_path + file_name
## get embeded_file in bytes
fData = doc.embeddedFileGet(item_name)
## save embeded file
with open(out_pdf, 'wb') as outfile:
outfile.write(fData)
disclaimer: I am the author of borb (the library used in this answer)
borb is an open-source, pure Python PDF library. It abstracts away most of the unpleasantness of dealing with PDF (such as having to deal with dictionaries and having to know PDF-syntax and structure).
There is a huge repository of examples, containing a section on dealing with embedded files, which you can find here.
I'll repeat the relevant example here for completeness:
import typing
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF
def main():
# read the Document
doc: typing.Optional[Document] = None
with open("output.pdf", "rb") as pdf_file_handle:
doc = PDF.loads(pdf_file_handle)
# check whether we have read a Document
assert doc is not None
# retrieve all embedded files and their bytes
for k, v in doc.get_embedded_files().items():
# display the file name, and the size
print("%s, %d bytes" % (k, len(v)))
if __name__ == "__main__":
main()
After the Document has been read, you can simply ask it for a dict mapping the filenames unto the bytes.

CombiningPDF's on filename

Hello I wrote this code to extract a list of PDF names.it** then compares to the input of where the PDF data is stored.The current code merges everything in that folder that is in the pdf data i commented out the part where it checks if the filename "starts with" because it's only then merges one of the documents and not all that are specified.
import json
from PyPDF2 import PdfFileMerger
import os
input = 'pdfdata'
list = os.listdir(input)
with open('pdf.json','r') as jsonfile:
jason_info =json.load(jsonfile)
json_data = [jason_info]
for item in json_data:
for data_item in item['info']:
jobjects = (data_item.values())#return only values
informa = ''.join(ll)#string
filemerger = PdfFileMerger()
for file in dlist:
# if file.startswith(informa):
filemerger.append(file)
filemerger.write("combinedpdfs.pdf")
filemerger.close()
Does this do what you want it to?
filemerger = PdfFileMerger()
for item in json_data:
for dicitonary in item['documents']: # retrieve key & value
for pdf in dictionary.values():
if pdf in dlist:
filemerger.append(pdf)
else:
raise ValueError(f'Pdf file {pdf} missing!')
filemerger.write("combinedpdfs.pdf")
filemerger.close()
I moved the filemerger line out of the for block as otherwise you're constantly resetting it. I also removed the for loop which goes through dlist as you can just check if the pdf file is in dlist.

urllib urlretrieve only saving final image in list of urls

I'm fairly new to using Python. I have been trying to set up a very basic web scraper to help speed up my workday, it is supposed to download images from a section of a website and save them.
I have a list of urls and I am trying to use urllib.request.urlretrieve to download all the images.
The output location (savepath) updates so it adds 1 to the current highest number in the folder.
I've tried a bunch of different ways but urlretrieve only saves the image from the last url in the list. Is there a way to download all the images in the url list?
to_download=['url1','url2','url3','url4']
for t in to_download:
urllib.request.urlretrieve(t, savepath)
This is the code I was trying to use to update the savepath every time
def getNextFilePath(photos):
highest_num = 0
for f in os.listdir(photos):
if os.path.isfile(os.path.join(photos, f)):
file_name = os.path.splitext(f)[0]
try:
file_num = int(file_name)
if file_num > highest_num:
highest_num = file_num
except ValueError:
'The file name "%s" is not an integer. Skipping' % file_name
output_file = os.path.join(output_folder, str(highest_num + 1))
return output_file
as suggested by #vks, you need to update savepath (otherwise you save each url onto the same file). One way to do so, is to use enumerate:
from urllib import request
to_download=['https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/']
for i, url in enumerate(to_download):
save_path = f'website_{i}.txt'
print(save_path)
request.urlretrieve(url, save_path)
which you may want to contract into:
from urllib import request
to_download=['https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/']
[request.urlretrieve(url, f'website_{i}.txt') for i, url in enumerate(to_download)]
see:
Python3 doc: Python enumerate doc
Example of enumerate: enumerate example
Example of f' using a string with a {variable}': f string example
FOR SECOND PART OF THE QUESTION:
Not sure what you are trying to achieve but:
def getNextFilePath(photos):
file_list = os.listdir(photos)
file_list = [int(s) for s in file_list if s.isdigit()]
print(file_list)
max_id_file = max(file_list)
print(f'max id:{max_id_file}')
output_file = os.path.join(output_folder, str(max_id_file + 1))
print(f'output file path:{output_file}')
return output_file
this will hopefully find all files that are named with digits (IDs), and find the highest ID, and return a new file name as a max_id+1
I guess that this will replace the save_path in your example.
Which quickly coding, AND MODIFYING above function, so that it returns the max_id and not the path.
The bellow code be a working example using the iterrator:
import os
from urllib import request
photo_folder = os.path.curdir
def getNextFilePath(photos):
file_list = os.listdir(photos)
print(file_list)
file_list = [int(os.path.splitext(s)[0]) for s in file_list if os.path.splitext(s)[0].isdigit()]
if not file_list:
return 0
print(file_list)
max_id_file = max(file_list)
#print(f'max id:{max_id_file}')
#output_file = os.path.join(photo_folder, str(max_id_file + 1))
#print(f'output file path:{output_file}')
return max_id_file
def download_pic(to_download):
start_id = getNextFilePath(photo_folder)
for i, url in enumerate(to_download):
save_path = f'{i+start_id}.png'
output_file = os.path.join(photo_folder, save_path)
print(output_file)
request.urlretrieve(url, output_file)
You should add handling exception etc, but this seems to be working, if I understood correctly.
Are you updating savepath? If you pass the same savepath to each loop iteration, it is likely just overwriting the same file over and over.
Hope that helps, happy coding!

BeautifulSoup - scraping a forum page

I'm trying to scrape a forum discussion and export it as a csv file, with rows such as "thread title", "user", and "post", where the latter is the actual forum post from each individual.
I'm a complete beginner with Python and BeautifulSoup so I'm having a really hard time with this!
My current problem is that all the text is split into one character per row in the csv file. Is there anyone out there who can help me out? It would be fantastic if someone could give me a hand!
Here's the code I've been using:
from bs4 import BeautifulSoup
import csv
import urllib2
f = urllib2.urlopen("https://silkroad5v7dywlc.onion.to/index.php?action=printpage;topic=28536.0")
soup = BeautifulSoup(f)
b = soup.get_text().encode("utf-8").strip() #the posts contain non-ascii words, so I had to do this
writer = csv.writer(open('silkroad.csv', 'w'))
writer.writerows(b)
Ok here we go. Not quite sure what I'm helping you do here, but hopefully you have a good reason to be analyzing silk road posts.
You have a few issues here, the big one is that you aren't parsing the data at all. What you're essentially doing with .get_text() is going to the page, highlighting the whole thing, and then copying and pasting the whole thing to a csv file.
So here is what you should be trying to do:
Read the page source
Use soup to break it into sections you want
Save sections in parallel arrays for author, date, time, post, etc
Write data to csv file row by row
I wrote some code to show you what that looks like, it should do the job:
from bs4 import BeautifulSoup
import csv
import urllib2
# get page source and create a BeautifulSoup object based on it
print "Reading page..."
page = urllib2.urlopen("https://silkroad5v7dywlc.onion.to/index.php?action=printpage;topic=28536.0")
soup = BeautifulSoup(page)
# if you look at the HTML all the titles, dates,
# and authors are stored inside of <dt ...> tags
metaData = soup.find_all("dt")
# likewise the post data is stored
# under <dd ...>
postData = soup.find_all("dd")
# define where we will store info
titles = []
authors = []
times = []
posts = []
# now we iterate through the metaData and parse it
# into titles, authors, and dates
print "Parsing data..."
for html in metaData:
text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "") # convert the html to text
titles.append(text.split("Title:")[1].split("Post by:")[0].strip()) # get Title:
authors.append(text.split("Post by:")[1].split(" on ")[0].strip()) # get Post by:
times.append(text.split(" on ")[1].strip()) # get date
# now we go through the actual post data and extract it
for post in postData:
posts.append(BeautifulSoup(str(post)).get_text().encode("utf-8").strip())
# now we write data to csv file
# ***csv files MUST be opened with the 'b' flag***
csvfile = open('silkroad.csv', 'wb')
writer = csv.writer(csvfile)
# create template
writer.writerow(["Time", "Author", "Title", "Post"])
# iterate through and write all the data
for time, author, title, post in zip(times, authors, titles, posts):
writer.writerow([time, author, title, post])
# close file
csvfile.close()
# done
print "Operation completed successfully."
EDIT: Included solution that can read files from directory and use data from that
Okay, so you have your HTML files in a directory. You need to get a list of files in the directory, iterate through them, and append to your csv file for each file in the directory.
This is the basic logic of our new program.
If we had a function called processData() that took a file path as an argument and appended data from the file to your csv file here is what it would look like:
# the directory where we have all our HTML files
dir = "myDir"
# our csv file
csvFile = "silkroad.csv"
# insert the column titles to csv
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Time", "Author", "Title", "Post"])
csvfile.close()
# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
path = os.path.join(dir, htmlFile) # get the file path
processData(path) # process the data in the file
print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..." # display status
count = count + 1 # increment counter
As it happens our processData() function is more or less what we did before, with a few changes.
So this is very similar to our last program, with a few small changes:
We write the column headers first thing
Following that we open the csv with the 'ab' flag to append
We import os to get a list of files
Here's what that looks like:
from bs4 import BeautifulSoup
import csv
import urllib2
import os # added this import to process files/dirs
# ** define our data processing function
def processData( pageFile ):
''' take the data from an html file and append to our csv file '''
f = open(pageFile, "r")
page = f.read()
f.close()
soup = BeautifulSoup(page)
# if you look at the HTML all the titles, dates,
# and authors are stored inside of <dt ...> tags
metaData = soup.find_all("dt")
# likewise the post data is stored
# under <dd ...>
postData = soup.find_all("dd")
# define where we will store info
titles = []
authors = []
times = []
posts = []
# now we iterate through the metaData and parse it
# into titles, authors, and dates
for html in metaData:
text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "") # convert the html to text
titles.append(text.split("Title:")[1].split("Post by:")[0].strip()) # get Title:
authors.append(text.split("Post by:")[1].split(" on ")[0].strip()) # get Post by:
times.append(text.split(" on ")[1].strip()) # get date
# now we go through the actual post data and extract it
for post in postData:
posts.append(BeautifulSoup(str(post)).get_text().encode("utf-8").strip())
# now we write data to csv file
# ***csv files MUST be opened with the 'b' flag***
csvfile = open('silkroad.csv', 'ab')
writer = csv.writer(csvfile)
# iterate through and write all the data
for time, author, title, post in zip(times, authors, titles, posts):
writer.writerow([time, author, title, post])
# close file
csvfile.close()
# ** start our process of going through files
# the directory where we have all our HTML files
dir = "myDir"
# our csv file
csvFile = "silkroad.csv"
# insert the column titles to csv
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Time", "Author", "Title", "Post"])
csvfile.close()
# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
path = os.path.join(dir, htmlFile) # get the file path
processData(path) # process the data in the file
print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..." # display status
count = count + 1 # incriment counter

Categories

Resources