I am trying to merge 1000+ PDF pages, and it works with fewer than 750 pages. If I open more than 750, it still processes them, but the output file is 0 bytes.
from PyPDF3 import PdfFileWriter, PdfFileReader, PdfFileMerger
import os
import sys
from collections import OrderedDict
import win32file
win32file._setmaxstdio(8192)
print(win32file._getmaxstdio())
sys.setrecursionlimit(30000)
nameOfFile = os.path.basename(os.getcwd())
#get page number
def getPageNr(arg1):
    stro = str(arg1)
    stro = stro.replace('.pdf', '')
    listR = stro.split(' - ')
    listR[len(listR)-1] = listR[len(listR)-1].replace('-', '')
    listR[len(listR)-1] = listR[len(listR)-1].replace('Page ', '')
    pgNr = int(listR[len(listR)-1])
    return pgNr
currentFolder = os.getcwd()
pdffiles = [os.path.join(name)
            for root, dirs, files in os.walk(currentFolder)
            for name in files
            if name.endswith(".pdf")]
#create dictionary and get whole list
di={}
#direct copy and create key from page number on back and value is original list
for string in pdffiles:
    di.setdefault(getPageNr(string), str(string))
#sort it by keys
di2 = OrderedDict(sorted(di.items()))
pdffiles.clear()
for key, values in di2.items():
    pdffiles.append(values)
#put a correction
pageAt = 0
adder = 421
pageAt = pageAt + adder
#add global variables for page in bookmark
mainTitlePage = 0
secondTitlePage = 0
thirdTitlePage = 0
#define globals for bookmarks
mainTitle = ''
secondTitle = ''
thirdTitle = ''
#define previous bookmarks
lastMainTitle = ''
lastSecondTitle = ''
lastThirdTitle = ''
#if main title is same as next page
isSame = True
#start Merger
editer = PdfFileMerger()
#start main loop
while pageAt < (adder + 2000) and pageAt < len(pdffiles) and isSame:
    #break filename into titles
    titles = pdffiles[pageAt].split(' - ')
    #break next page into titles
    titlesNext = pdffiles[pageAt+1].split(' - ')
    #get titles
    mainTitle = titles[0]
    secondTitle = titles[1]
    if not titlesNext[0] == mainTitle:
        isSame = False
    hasThird = False
    if len(titles) > 4:
        thirdTitle = titles[2]
        hasThird = True
    else:
        thirdTitle = None
        hasThird = False
    #open individual page
    kStream = open(pdffiles[pageAt], 'rb')
    inputK = PdfFileReader(kStream)
    #test if titles are changing
    if not mainTitle == lastMainTitle:
        KmainParent = editer.addBookmark(mainTitle, 0)
    if not secondTitle == lastSecondTitle:
        secondTitlePage = pageAt - adder
        #print(secondTitle)
        Kparent = editer.addBookmark(secondTitle, secondTitlePage, KmainParent)
    if hasThird:
        if not thirdTitle == lastThirdTitle:
            thirdTitlePage = pageAt - adder
            Mparent = editer.addBookmark(thirdTitle, thirdTitlePage, Kparent)
        editer.addBookmark(titles[3], pageAt - adder, Mparent)
    else:
        editer.addBookmark(titles[2], pageAt - adder, Kparent)
    #merge page with fixed bookmarks
    editer.merge((pageAt - adder), inputK)
    #save titles for the next iteration
    lastMainTitle = mainTitle
    lastSecondTitle = secondTitle
    lastThirdTitle = thirdTitle
    #go to next page
    pageAt += 1
#get name for output file
nameOfFile = mainTitle + '.pdf'
print('Saving ' + nameOfFile)
#start new file and export it
outR = open(nameOfFile, 'wb')
editer.write(outR)
outR.close()
kStream.close()
Now it adds all the bookmarks, no problem there. But how can I process more than 750 pages?
I have increased the recursion limit and maxstdio, but if there are 1000 or more pages the merged file is 0 bytes, even though the process takes a minute or two, so it is doing something.
I do not get any errors.
Can anybody help me process more than 750 pages?
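One direction worth trying (a minimal sketch, not a confirmed fix): merge the pages in chunks so only a few hundred source streams are open at any one time, then merge the intermediate files. The chunk size and file names below are arbitrary, and the bookmark logic from the script above is left out for brevity.

from PyPDF3 import PdfFileMerger

def merge_in_chunks(pdf_paths, output_path, chunk_size=500):
    # First pass: merge every chunk of pages into its own intermediate file,
    # so only chunk_size source streams are open at any one time.
    chunk_files = []
    for start in range(0, len(pdf_paths), chunk_size):
        chunk_merger = PdfFileMerger()
        streams = [open(path, 'rb') for path in pdf_paths[start:start + chunk_size]]
        for stream in streams:
            chunk_merger.append(stream)
        chunk_name = 'chunk_%d.pdf' % (start // chunk_size)
        with open(chunk_name, 'wb') as out:
            chunk_merger.write(out)
        chunk_merger.close()
        for stream in streams:
            stream.close()
        chunk_files.append(chunk_name)
    # Second pass: merge the handful of intermediate files into the final output.
    final_merger = PdfFileMerger()
    for chunk in chunk_files:
        final_merger.append(chunk)
    with open(output_path, 'wb') as out:
        final_merger.write(out)
    final_merger.close()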
Related
I'm trying to combine 40+ fillable PDFs into one PDF. Each PDF has one page, and they are the same form with different data. I followed the script from (PyPDF2 PdfFileMerger loosing PDF module in merged file) to merge the PDFs, but the merged output duplicates the first file 45 times (for 45 files). Does anyone know what the issue could be and what the solution is? Thank you in advance!
First, I define a function to fill the forms:
# assumes: from PyPDF2 import PdfFileReader, PdfFileWriter
# assumes: from PyPDF2.generic import NameObject, BooleanObject, IndirectObject
def set_need_appearances_writer(writer):
    try:
        catalog = writer._root_object
        # get the AcroForm tree and add the "/NeedAppearances" attribute
        if "/AcroForm" not in catalog:
            writer._root_object.update({
                NameObject("/AcroForm"): IndirectObject(len(writer._objects), 0, writer)})
        need_appearances = NameObject("/NeedAppearances")
        writer._root_object["/AcroForm"][need_appearances] = BooleanObject(True)
    except Exception as e:
        print('set_need_appearances_writer() catch : ', repr(e))
    return writer
def AYX_PDF_form_fill(i, template, outfile):
    try:
        field_dictionary = df.to_dict('records')[i]  # df is a pandas DataFrame defined elsewhere
        inputStream = open(template, "rb")
        pdf_reader = PdfFileReader(inputStream, strict=False)
        pdf_writer = PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        pdf_writer.addPage(pdf_reader.getPage(0))
        pdf_writer.updatePageFormFieldValues(pdf_writer.getPage(0), field_dictionary)
        #set_annotation_flag_writer(pdf_writer, field_dictionary)
        outputStream = open(outfile, "wb")
        #pdf_writer.encrypt(userPWD, ownerPWD, use_128bit=True)
        pdf_writer.write(outputStream)
        inputStream.close()
        outputStream.close()
    except Exception as e:
        print('AYX_PDF_form_fill() catch : ', repr(e))
    return
Then I call the function to fill forms:
template = 'input/template.pdf'
for i in range(0, len(df)):
    outfile = os.path.join('output/pdf', "%i.pdf" % i)
    AYX_PDF_form_fill(i, template, outfile)
Next, I define a function to merge the PDFs:
from pdfrw import PdfReader, PdfWriter, PdfName

def merge_pdf_files_pdfrw(pdf_files, output_filename):
    output = PdfWriter()
    num = 0
    output_acroform = None
    for pdf in pdf_files:
        input = PdfReader(pdf, verbose=False)
        output.addpages(input.pages)
        if PdfName('AcroForm') in input[PdfName('Root')].keys():  # Not all PDFs have an AcroForm node
            source_acroform = input[PdfName('Root')][PdfName('AcroForm')]
            if PdfName('Fields') in source_acroform:
                output_formfields = source_acroform[PdfName('Fields')]
            else:
                output_formfields = []
            num2 = 0
            for form_field in output_formfields:
                key = PdfName('T')
                old_name = form_field[key].replace('(', '').replace(')', '')  # Field names are in the "(name)" format
                form_field[key] = 'FILE_{n}_FIELD_{m}_{on}'.format(n=num, m=num2, on=old_name)
                num2 += 1
            if output_acroform is None:
                # copy the first AcroForm node
                output_acroform = source_acroform
            else:
                for key in source_acroform.keys():
                    # Add new AcroForm keys if output_acroform already exists
                    if key not in output_acroform:
                        output_acroform[key] = source_acroform[key]
                # Add missing font entries in the /DR node of the source file
                if (PdfName('DR') in source_acroform.keys()) and (PdfName('Font') in source_acroform[PdfName('DR')].keys()):
                    if PdfName('Font') not in output_acroform[PdfName('DR')].keys():
                        # if output_acroform is missing the /Font node under an existing /DR, simply add it
                        output_acroform[PdfName('DR')][PdfName('Font')] = source_acroform[PdfName('DR')][PdfName('Font')]
                    else:
                        # else add new fonts only
                        for font_key in source_acroform[PdfName('DR')][PdfName('Font')].keys():
                            if font_key not in output_acroform[PdfName('DR')][PdfName('Font')]:
                                output_acroform[PdfName('DR')][PdfName('Font')][font_key] = source_acroform[PdfName('DR')][PdfName('Font')][font_key]
            if PdfName('Fields') not in output_acroform:
                output_acroform[PdfName('Fields')] = output_formfields
            else:
                # Add new fields
                output_acroform[PdfName('Fields')] += output_formfields
        num += 1
    output.trailer[PdfName('Root')][PdfName('AcroForm')] = output_acroform
    output.write(output_filename)
Then I call the function:
# the list contains 45 files
pdf_files=['output/pdf/sample1.pdf','output/pdf/sample2.pdf',....]
merged='output/pdf/merged.pdf'
merge_pdf_files_pdfrw(pdf_files,merged)
The merged output file has 45 pages, all duplicating the information from the first file.
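A small diagnostic sketch (file names are placeholders) that lists each source file's AcroForm field values with pdfrw before merging, to check whether the per-file data really differs on disk or is only being rendered identically by the viewer:

from pdfrw import PdfReader, PdfName

def dump_form_values(pdf_files):
    # Print the /T (name) and /V (value) of every AcroForm field in each file.
    for path in pdf_files:
        reader = PdfReader(path, verbose=False)
        acroform = reader[PdfName('Root')][PdfName('AcroForm')]
        fields = acroform[PdfName('Fields')] if acroform else []
        print(path)
        for field in fields or []:
            print('  ', field[PdfName('T')], '=', field[PdfName('V')])

# example call with placeholder file names
dump_form_values(['output/pdf/0.pdf', 'output/pdf/1.pdf'])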
I'm trying to make a project where I create a graph from a Python project.
I have this code:
import os
import sys
import re
import networkx as nx
from pyvis.physics import Physics
from radon.visitors import ComplexityVisitor
from pyvis.network import Network
rootDir ="/home/ask/Git/Zeeguu-API"
depth = int(sys.argv[1])
class directory:
    def __init__(self, path, ParentDir=None, ChildrenDirs=[], PyChildren=[]) -> None:
        self.path = path
        self.parentDir = ParentDir
        self.childrenDirs = ChildrenDirs
        self.pyChildren = PyChildren
def getComplexityoffile(file: str):
    f = open(file, "r")
    s = f.read()
    return ComplexityVisitor.from_code(s).total_complexity
def getParentOfDir(dir: str):
    cutlast = dir.split("/")[:-1]
    join = "/".join(cutlast)
    if join:
        return join
    else:
        return "./"
def extract_importandClass_from_line(unline):
    # try "import x" first, then fall back to "from x import ..."
    x = re.search("^import (\S+)", unline)
    if x is None:
        x = re.search("^from (\S+)", unline)
    return x.group(1)  #, c.group(1).split('(')[0]
def getimportsforfile(file):
    lines = [line for line in open(file)]
    classes = []
    all_imports = []
    for line in lines:
        try:
            imports = extract_importandClass_from_line(line)
            tmp = imports.rsplit('.', 1)
            importEnd = tmp[-1]
            importsFormatted = imports.replace('.', '/')
            finalimport = importsFormatted[1:] if importsFormatted.startswith('/') else importsFormatted
            all_imports.append(importsFormatted)
        except:
            continue
    return all_imports
NodesAndComplexity = {} # (node/complexity in folder)
# things I will use to keep track of dependencies
Map_Dirs_And_Files_To_Displaybledirs = {}
pythonFile_to_imports = {} # (Fille importing, file/dir imported)
dirsForDisplay = set()
# mapping files to parent directories
parenDirToChildDir = {} # (parent, [list of children])
G = nx.DiGraph()
isRoot = True
for root, dirs, files in os.walk(rootDir):
    pyfiles = list(filter(lambda a: a.endswith('.py'), files))
    thisDir = root.replace(rootDir, '')
    splitDIR = thisDir[1:].split("/")[:depth]
    if not isRoot:
        displayableDir = "/" + "/".join(splitDIR)
    else:
        displayableDir = "./"
        isRoot = False
    # if there are python files in this directory
    referentialDIr = thisDir[1:] if thisDir.startswith('/') else thisDir
    Map_Dirs_And_Files_To_Displaybledirs[referentialDIr] = displayableDir
    if (pyfiles):
        accumulateComplexity = 0
        for f in pyfiles:
            filepath = root + "/" + f
            imports = getimportsforfile(filepath)
            logFile = thisDir + "/" + f[:-3]
            accumulateComplexity = accumulateComplexity + getComplexityoffile(filepath)
            removedslashFromLogfile = logFile[1:] if logFile.startswith('/') else logFile
            Map_Dirs_And_Files_To_Displaybledirs[removedslashFromLogfile] = displayableDir
            pythonFile_to_imports[removedslashFromLogfile] = imports
        if displayableDir not in NodesAndComplexity:
            NodesAndComplexity[displayableDir] = accumulateComplexity
        else:
            NodesAndComplexity[displayableDir] = NodesAndComplexity[displayableDir] + accumulateComplexity
    if (displayableDir not in dirsForDisplay):
        dirsForDisplay.add(thisDir)
        G.add_node(displayableDir, Physics=False)
        if not isRoot and displayableDir != "./":
            parent = getParentOfDir(displayableDir)
            G.add_edge(parent, displayableDir)
# adding dependency edges between directories
for importingfile, importlist in pythonFile_to_imports.items():
    for importt in importlist:
        if importt in Map_Dirs_And_Files_To_Displaybledirs:
            fromf = Map_Dirs_And_Files_To_Displaybledirs[importingfile]
            to = Map_Dirs_And_Files_To_Displaybledirs[importt]
            if fromf != to:
                G.add_edge(fromf, to, color="red")
# setting node sizes
for node, complexity in NodesAndComplexity.items():
    complexixtyDisplay = complexity / 2
    G.nodes[node]["size"] = complexixtyDisplay
Displayer = Network(directed=True, height="1500px", width="100%")
Displayer.from_nx(G)
Displayer.barnes_hut(overlap=1)
Displayer.show_buttons(filter_=["physics"])
Displayer.show("pik.html")
This creates the graph just fine. However, when I create it, the graph flies around my screen, and it is impossible to actually get a look at it.
If I remove Displayer.barnes_hut(overlap=1), then it doesn't move, but the nodes are all just bunched up on top of each other, and again it is impossible to decipher the graph.
How do I get a graph that is both (reasonably) still and readable?
In the show_buttons function, add all the buttons; after creating the pik.html file, open the HTML file in Google Chrome. In the button options there will be a physics category, where you can disable the physics option.
From then on the nodes will not move, and you can distribute them as you want by dragging them.
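For completeness, a short sketch of doing the same thing in code rather than in the browser (it mirrors the tail of the script above; the Barnes-Hut parameter values are illustrative guesses): either damp the solver so it settles, or switch physics off entirely and drag the nodes into place by hand.

from pyvis.network import Network

Displayer = Network(directed=True, height="1500px", width="100%")
Displayer.from_nx(G)
# Option 1: keep the Barnes-Hut solver but damp it so the layout settles
# instead of flying around (parameter values are guesses to tune).
Displayer.barnes_hut(gravity=-8000, central_gravity=0.3, spring_length=200,
                     spring_strength=0.01, damping=0.9, overlap=1)
# Option 2: switch the physics engine off entirely; nodes stay where they are
# and can be dragged into place in the browser.
# Displayer.toggle_physics(False)
Displayer.show_buttons(filter_=["physics"])
Displayer.show("pik.html")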
Further to a post I made a couple of weeks ago, I'm reading rows from a spreadsheet (nearly 215,000) and attempting to match them with text files contained in a sub-directory. On average the number of text files contained in the sub-directory is 14,000. Although my code is working, it is taking an inordinate amount of time to copy the matched files to a second sub-directory. At this rate it's going to be the end of August before the job is complete (average processing time is six hours).
Is there a way to improve the efficiency of this algorithm, or indeed is there a better way? My code is below.
Regards
import glob
import os,sys
import csv
import shutil
import pandas as pd
import fnmatch
import string
import xlrd
from os import listdir
from os.path import isfile
MDA_Path = 'D:/1994_QTR3' # contains Loughram and MacDonald 10-K files for QTR3
MDA_Path_2 = 'D:/1994_QTR4' # Contains L&M 10-K files for QTR4
MDA_Path_3 = 'D:/1995_QTR1'
MDA_Path_4 = 'D:/1995_QTR2'
MDA_Path_5 = 'D:/1995_QTR3'
MDA_Path_6 = 'D:/1995_QTR4'
MDA_Path_7 = 'D:/1996_QTR1'
MDA_Path_8 = 'D:/1996_QTR2'
MDA_Path_9 = 'D:/1996_QTR3'
MDA_Path_10 = 'D:/1996_QTR4'
MDA_Path_11 = 'D:/1997_QTR1'
MDA_Path_12 = 'D:/1997_QTR2'
MDA_Path_13 = 'D:/1997_QTR3'
MDA_Path_14 = 'D:/1997_QTR4'
MDA_Path_15 = 'D:/1998/QTR1'
MDA_Path_16 = 'D:/1998/QTR2'
MDA_Path_17 = 'D:/1998/QTR3'
MDA_Path_18 = 'D:/1998/QTR4'
MDA_Path_19 = 'D:/1999/QTR1'
MDA_Path_20 = 'D:/1999/QTR2'
MDA_Path_21 = 'D:/1999/QTR3'
MDA_Path_22 = 'D:/1999/QTR4'
MDA_Path_23 = 'D:/2000/QTR1'
MDA_Path_24 = 'D:/2000/QTR2'
MDA_Path_25 = 'D:/2000/QTR3'
MDA_Path_26 = 'D:/2000/QTR4'
MDA_Path_27 = 'D:/2001/QTR1'
MDA_Path_28 = 'D:/2001/QTR2'
MDA_Path_29 = 'D:/2001/QTR3'
MDA_Path_30 = 'D:/2001/QTR4'
MDA_Path_31 = 'D:/2002/QTR1'
MDA_Path_32 = 'D:/2002/QTR2'
MDA_Path_33 = 'D:/2002/QTR3'
MDA_Path_34 = 'D:/2002/QTR4'
MDA_Target_List = r'D:/PhD_Data/Wenrui_Filing_list' # stores wenruis data
MDA_For_Parsing_1994_QTR3 = 'D:/Required_MDA_1994_QTR3' # will hold all 10-Ks from wenrui's spreadsheet once detected
MDA_For_Parsing_1994_QTR4 = 'D:/Required_MDA_1994_QTR4'
MDA_For_Parsing_1995_QTR1 = 'D:/Required_MDA_1995_QTR1'
MDA_For_Parsing_1995_QTR2 = 'D:/Required_MDA_1995_QTR2'
MDA_For_Parsing_1995_QTR3 = 'D:/Required_MDA_1995_QTR3'
MDA_For_Parsing_1995_QTR4 = 'D:/Required_MDA_1995_QTR4'
MDA_For_Parsing_1996_QTR1 = 'D:/Required_MDA_1996_QTR1'
MDA_For_Parsing_1996_QTR2 = 'D:/Required_MDA_1996_QTR2'
MDA_For_Parsing_1996_QTR3 = 'D:/Required_MDA_1996_QTR3'
MDA_For_Parsing_1996_QTR4 = 'D:/Required_MDA_1996_QTR4'
MDA_For_Parsing_1997_QTR1 = 'D:/Required_MDA_1997_QTR1'
MDA_For_Parsing_1997_QTR2 = 'D:/Required_MDA_1997_QTR2'
MDA_For_Parsing_1997_QTR3 = 'D:/Required_MDA_1997_QTR3'
MDA_For_Parsing_1997_QTR4 = 'D:/Required_MDA_1997_QTR4'
MDA_For_Parsing_1998_QTR1 = 'D:/Required_MDA_1998_QTR1'
MDA_For_Parsing_1998_QTR2 = 'D:/Required_MDA_1998_QTR2'
MDA_For_Parsing_1998_QTR3 = 'D:/Required_MDA_1998_QTR3'
MDA_For_Parsing_1998_QTR4 = 'D:/Required_MDA_1998_QTR4'
MDA_For_Parsing_1999_QTR1 = 'D:/Required_MDA_1999_QTR1'
MDA_For_Parsing_1999_QTR2 = 'D:/Required_MDA_1999_QTR2'
MDA_For_Parsing_1999_QTR3 = 'D:/Required_MDA_1999_QTR3'
MDA_For_Parsing_1999_QTR4 = 'D:/Required_MDA_1999_QTR4'
MDA_For_Parsing_2000_QTR1 = 'D:/Required_MDA_2000_QTR1'
MDA_For_Parsing_2000_QTR2 = 'D:/Required_MDA_2000_QTR2'
MDA_For_Parsing_2000_QTR3 = 'D:/Required_MDA_2000_QTR3'
MDA_For_Parsing_2000_QTR4 = 'D:/Required_MDA_2000_QTR4'
MDA_For_Parsing_2001_QTR1 = 'D:/Required_MDA_2001_QTR1'
MDA_For_Parsing_2001_QTR2 = 'D:/Required_MDA_2001_QTR2'
MDA_For_Parsing_2001_QTR3 = 'D:/Required_MDA_2001_QTR3'
MDA_For_Parsing_2001_QTR4 = 'D:/Required_MDA_2001_QTR4'
MDA_FOR_Parsing_2002_QTR1 = 'D:/Required_MDA_2002_QTR1'
MDA_FOR_Parsing_2002_QTR2 = 'D:/Required_MDA_2002_QTR2'
MDA_FOR_Parsing_2002_QTR3 = 'D:/Required_MDA_2002_QTR3'
MDA_FOR_Parsing_2002_QTR4 = 'D:/Required_MDA_2002_QTR4'
# open the csv file and extract the column containing the location of the text file(s)
datas = pd.read_excel(r'D:/PhD_Data/Wenrui_Filing_list/1994-2017filingslist_Wenrui_13Jul2020.xlsx')
df = pd.DataFrame(datas, columns = ['FILE_NAME']) # extract the data contained in FILE_NAME column
df['FILE_NAME'] = df['FILE_NAME'].str[26:] # remove the first 26 characters which contain the edgar drive info
df['FILE_NAME'] = df['FILE_NAME'].str.strip() # remove all leading and trailing whitespace
file_length = len(df) # count number of files in Wenrui's list (will need this later to loop through all occurrences)
dirs = os.listdir(MDA_Path_32)
# dirs1 = os.listdir(MDA_Path_3)
for x in range(file_length):
    for file in dirs:
        # if file == df['FILE_NAME'][x]:
        if df['FILE_NAME'][x] in file:
            print(file)
            shutil.copy(MDA_Path_32 + '/' + file, MDA_FOR_Parsing_2002_QTR2)  # Copy it to the QTR directory
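A hedged sketch of one way to avoid the nested loop, assuming the spreadsheet values are exact file names rather than substrings (it reuses the variables defined above): build a set of the wanted names once, then make a single pass over the directory listing. If the spreadsheet values are only fragments of the file names, a dictionary keyed on the fragment that actually appears in the names would be needed instead.

# Build the lookup once: every name from the spreadsheet, already stripped above.
wanted = set(df['FILE_NAME'])

# Single pass over the directory: one membership test per file instead of
# a nested loop over every (row, file) pair.
for file in dirs:
    if file in wanted:  # exact-match case
        shutil.copy(os.path.join(MDA_Path_32, file), MDA_FOR_Parsing_2002_QTR2)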
I wrote code that works fine for a single file, but I have to change the names for each file. It reads a pickle file, writes it into a txt file, then does some processing on the content of the txt file and produces a list of numbers; at the end it stores the list in a DataFrame and writes that DataFrame to a csv file.
from math import log2  # needed by cross_entropy
import pandas as pd    # needed for the DataFrame/CSV step below

def get_value_of_list(bit_list):
    p_number = 0
    for i in bit_list:
        if i == 1:
            p_number = p_number + 1
    return p_number

def cross_entropy(p, q):
    return -sum([p[i] * log2(q[i]) for i in range(len(p))])
if __name__ == "__main__":
    # PickleToFile and HexToBinary are the author's own helper classes (defined elsewhere)
    file_name = 'pickleData_AIMchat2.txt'
    pickle_file = 'AIMchat2.pickle'
    pk = PickleToFile(file_name, pickle_file)
    pk.create_pickle_file()
    h = HexToBinary(file_name)
    hex_list = h.read_file()
    num_of_bits = 8
    scale = 16
    bin_data = []
    for i in hex_list:
        bin_data.append(bin(int(i, scale))[2:].zfill(num_of_bits))
    my_bit_list = []
    for byte in bin_data:
        bit_list = []
        for bit in byte:
            bit_list.append(int(bit))
        num_of_one_divided_by_eight = get_value_of_list(bit_list) / 8
        my_bit_list.append(num_of_one_divided_by_eight)
    cross_entropy_list = []
    i = 0
    while i < len(my_bit_list):
        cross = cross_entropy([my_bit_list[i]], [my_bit_list[i + 1]])
        cross_entropy_list.append(cross)
        i = i + 2
    df = pd.DataFrame(cross_entropy_list)
    df.to_csv(r'AIMchat2.csv', index=False, index_label=False, chunksize=1000000, header=False)
I have changed create_pickle_file() to the code below to read files in the directory:
class PickleToFile:
    def __init__(self, name, pickle_file):
        self.name = name
        self.pickle_file = pickle_file

    def create_pickle_file(self):
        # Path.iterdir() takes no arguments; point the Path at the directory instead
        basepath = Path('pickle/')
        files_in_basepath = basepath.iterdir()
        for item in files_in_basepath:
            if item.is_file():
                checkThePickle = open(self.pickle_file, "rb")
                with open(self.name, 'w') as filehandler:
                    for listItem in checkThePickle:
                        filehandler.write('%s\n' % listItem)
But since after reading the file it writes it to a text file and then a csv file, I don't know how to do that. I'd appreciate any suggestions.
If you are looking to get a list of files in a directory and process them, this should get you what you want:
How do I list all files of a directory?
Once you have this list of files, do a loop:
for each in list_of_files:
    process_function(each)
Then you are on your way, where process_function is the function and the argument is the filename.
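As a concrete sketch of that loop (the directory name and the body of process_function are placeholders), using pathlib so only the relevant files are picked up:

from pathlib import Path

def process_function(path):
    # placeholder: run the pickle -> txt -> csv pipeline from the question here
    print('processing', path)

# collect every .pickle file in the directory, then process them one by one
list_of_files = sorted(Path('pickle/').glob('*.pickle'))
for each in list_of_files:
    process_function(each)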
I have written code to extract a div (see below), but now I would like to show all the "strong" tags in one column and the following text in a different column (for multiple files in a directory). In Dropbox I uploaded an example: (https://www.dropbox.com/s/kbnal2pefih2ru4/test.html?dl=0).
My code so far is:
import textwrap
import os
from bs4 import BeautifulSoup

directory = 'C:/Research syntheses - Meta analysis/SeekingAlpha/Tests/'
for filename in os.listdir(directory):
    if filename.endswith('.html'):
        fname = os.path.join(directory, filename)
        with open(fname, 'r') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
            participants = soup.find('div', class_='content_part hid', id='article_qanda')
            print(filename, participants)
So my output would need to be: in column 1 all the strong tags and in column 2 the following p tags (sometimes more than one). I hope someone can help me!
You can loop through all the participants and save a temporary array with the columns of each row. Then you can display them as you wish. This is an example:
import textwrap
import os
from bs4 import BeautifulSoup

fname = "test.html"
with open(fname, 'r') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')
participants = soup.find('div', class_='content_part hid', id='article_qanda')

n = -1
rows = []
for p in participants:
    name = p.find("strong")
    if name is not None and str(name) != "-1":
        n = n + 1
        rows.append([name.text])
    elif name is None:
        rows[n].append(p.text)

# now print all the rows
for r in rows:
    if len(r) > 1:
        # here you can display them as you wish.
        # r[0] contains the "strong" tag
        # r[1] contains the next "p" tag
        print("%s => %s" % (r[0], r[1]))
    else:
        # here you have only the "strong" tag
        print(r[0])
Edit:
I removed class_='content_part hid' from the soup.find call, removed one loop, and added the multiprocessing part; you can find info about multiprocessing here:
import os
from bs4 import BeautifulSoup
import multiprocessing as mp

def process(filename):
    if filename.endswith('.html'):
        fname = os.path.join(directory, filename)
        with open(fname, errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        participants = soup.find('div', id='article_qanda')
        if not participants:
            return
        for p in participants:
            name = p.find("strong")
            if name is not None and str(name) != "-1":
                print()
                print(name.text + " => ", end='')
            elif name is None:
                print(p.text, end=' ')

directory = '.'

if __name__ == '__main__':
    p = mp.Pool()
    p.map(process, os.listdir(directory))
Using the code of @rxw, I have edited his answer further into my final solution:
import textwrap
import os
from bs4 import BeautifulSoup
import pandas as pd

directory = 'C:/Research syntheses - Meta analysis/Transcripts'
for filename in os.listdir(directory):
    if filename.endswith('.html'):
        fname = os.path.join(directory, filename)
        with open(fname, errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        participants = soup.find('div', class_='content_part hid', id='article_qanda')
        if not participants:
            continue
        n = -1
        rows = []
        for p in participants:
            name = p.find("strong")
            if name is not None and str(name) != "-1":
                n = n + 1
                rows.append([name.text])
            elif name is None:
                rows[n].append(p.text)
        # now print all the rows
        for r in rows:
            if len(r) > 1:
                # here you can display them as you wish.
                # r[0] contains the "strong" tag
                # r[1] contains the next "p" tag
                print("%s => %s" % (r[0], r[1]))
            else:
                # here you have only the "strong" tag
                print(r[0])