Extracting strong text and following p's - python

I have written code to extract a div (see below), but now I would like to show all the "strong" tags in one column and the text that follows them in a different column, for multiple files in a directory. I uploaded an example to Dropbox: (https://www.dropbox.com/s/kbnal2pefih2ru4/test.html?dl=0).
My code so far is:
import textwrap
import os
from bs4 import BeautifulSoup

directory = 'C:/Research syntheses - Meta analysis/SeekingAlpha/Tests/'
for filename in os.listdir(directory):
    if filename.endswith('.html'):
        fname = os.path.join(directory, filename)
        with open(fname, 'r') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
            participants = soup.find('div', class_='content_part hid', id='article_qanda')
            print(filename, participants)
So my output would need to be: in column 1 all the "strong" texts, and in column 2 the paragraph(s) that follow each one (sometimes more than one). I hope someone can help me!

You can loop through all the participants and build a temporary array holding the columns of each row. Then you can display them however you wish. This is an example:
import textwrap
import os
from bs4 import BeautifulSoup

fname = "test.html"
with open(fname, 'r') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

participants = soup.find('div', class_='content_part hid', id='article_qanda')

n = -1
rows = []
for p in participants:
    # plain text children answer find() with -1, real tags with None or a Tag
    name = p.find("strong")
    if name is not None and str(name) != "-1":
        n = n + 1
        rows.append([name.text])
    elif name is None:
        rows[n].append(p.text)

# now print all the rows
for r in rows:
    if len(r) > 1:
        # here you can display them as you wish:
        # r[0] contains the "strong" text
        # r[1] contains the text of the next "p" tag
        print("%s => %s" % (r[0], r[1]))
    else:
        # here you have only the "strong" text
        print(r[0])
Edit:
I removed class_='content_part hid' from the soup.find call, removed one loop, and added the multiprocessing part; you can find more info about multiprocessing here:
import os
from bs4 import BeautifulSoup
import multiprocessing as mp

def process(filename):
    if filename.endswith('.html'):
        fname = os.path.join(directory, filename)
        with open(fname, errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        participants = soup.find('div', id='article_qanda')
        if not participants:
            return
        for p in participants:
            name = p.find("strong")
            if name is not None and str(name) != "-1":
                print()
                print(name.text + " => ", end='')
            elif name is None:
                print(p.text, end=' ')

directory = '.'

if __name__ == '__main__':
    p = mp.Pool()
    p.map(process, os.listdir(directory))

Using the code of #rxw, I have edited his answer further into my final solution:
import textwrap
import os
from bs4 import BeautifulSoup
import pandas as pd

directory = 'C:/Research syntheses - Meta analysis/Transcripts'
for filename in os.listdir(directory):
    if filename.endswith('.html'):
        fname = os.path.join(directory, filename)
        with open(fname, errors='ignore') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        participants = soup.find('div', class_='content_part hid', id='article_qanda')
        if not participants:
            continue
        n = -1
        rows = []
        for p in participants:
            name = p.find("strong")
            if name is not None and str(name) != "-1":
                n = n + 1
                rows.append([name.text])
            elif name is None:
                rows[n].append(p.text)
        # now print all the rows
        for r in rows:
            if len(r) > 1:
                # here you can display them as you wish:
                # r[0] contains the "strong" text
                # r[1] contains the text of the next "p" tag
                print("%s => %s" % (r[0], r[1]))
            else:
                # here you have only the "strong" text
                print(r[0])
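The pandas import above is not actually used yet. Since the stated goal is to have the "strong" text in one column and the following paragraph(s) in another, one possible way to finish this is sketched below. It assumes rows is built exactly as above (inside the same per-file loop, so filename is available); the output file name is hypothetical.
import pandas as pd

# rows as built above: [[strong text, following paragraph, following paragraph, ...], ...]
table = [[r[0], " ".join(r[1:])] for r in rows]        # column 1: strong, column 2: joined paragraphs
df = pd.DataFrame(table, columns=["strong", "text"])
df.to_csv("output_" + filename + ".csv", index=False)  # one CSV per input HTML file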

Related

Unable to add text file to path in python

I was trying to scrape some data with BeautifulSoup in Python from a site that has some products, and then store it in text files in separate folders. In the code below I am stuck near the end; I have included the rest just as a reference.
import unittest, time, random
import urllib.request
import os
from selenium import webdriver
from selenium.common.exceptions import InvalidArgumentException
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import pandas as pd

links = []
soup_list = []
imgs = []
website = "https://www.energystar.gov/productfinder"
rate = [i/10 for i in range(10)]
cnt = 0
quote = '"'
newline = '\n'
colon = ' : '

browser = webdriver.Firefox(executable_path="C:\\Users\\abc\\.wdm\\drivers\\geckodriver\\win64\\v0.29.1\\geckodriver.exe")
url2 = 'https://www.energystar.gov/productfinder/product/certified-room-air-cleaners/results?page_number='

def getdata(url):
    browser.get(url)
    content = browser.page_source
    soup1 = BeautifulSoup(content, "html.parser")
    return soup1

#pagenos = ['0','13']
pagenos = []
for i in range(0,2):
    pagenos.append(i)
    i =+ 1
print(pagenos)

for i in range(0,len(pagenos)):
    url = url2 + str(pagenos[i])
    soup1 = getdata(url)
    soup_list.append(soup1)
    for main in soup1.findAll('div', attrs = {'class' : 'row certified-room-air-cleaners'}):
        name = main.find('a', href=True)
        if (name != ''):
            links.append((name.get('href')).strip())

print("Got links : ", len(links))
print("Got soups : ", len(soup_list))
#print('Soup 1:', soup_list[1])

for link in links:
    #just for testing 10 links
    cnt = cnt + 1
    if cnt >= 20:
        break
    # time delay before we access the next page..
    time.sleep(random.choice(rate))
    #print("Fetching link..... : ", link)
    link = link[5:]
    #print("Fetching link..... : ", link)
    link = website + link
    browser.get(link)
    linkcontent = browser.page_source
    soup2 = BeautifulSoup(linkcontent, "html.parser")
    pmname = soup2.find('div', attrs={'class' : 'l-wrapper'}).find('h1')
    if not pmname:
        print("Error no product name for link : ", link)
        continue
    pmname = pmname.text.strip().split(' - ')
    bname = pmname[0].strip()
    mname = pmname[1].strip()
    #print(bname)
    #print(mname)
    # Creating folder with Brand Name as name
    try:
        os.makedirs(str(bname))
    except FileExistsError:
        # directory already exists
        pass
    # Creating text files with model number as name
    fname = mname + '.txt'
    path = '/' + str(bname)
    a = os.path.join(path, fname)
    print(a)
    ff = open(fname, mode='w')
    ff.write("BRAND NAME : " + quote + bname + quote + newline)
    ff.write("MODEL : " + quote + mname + quote + newline)

browser.close()
exit()
Here I can successfully create the text file, but I can't seem to write it into the created path.
You are missing something:
If you want to write a new file, you must open it in write mode ('w'); if you want to append to existing data, you must open it in append mode ('a') so the new data is added at the end. In either case the file must be opened before you write to it and closed when you are finished.
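As a side note on the original problem (the file ending up in the working directory instead of the brand folder): a minimal sketch, assuming the intent is to create the text file inside the folder just made with os.makedirs. The bname and mname values stand in for the ones scraped in the question's loop.
import os

bname = "ExampleBrand"   # brand name scraped in the question's loop (hypothetical here)
mname = "Model-123"      # model number scraped in the question's loop (hypothetical here)

# create the brand folder (relative to the working directory)
os.makedirs(bname, exist_ok=True)

# join the folder and the file name, then open that full path
fname = mname + '.txt'
a = os.path.join(bname, fname)   # e.g. "ExampleBrand/Model-123.txt"

with open(a, mode='w') as ff:    # 'w' to (re)write, 'a' to append
    ff.write('BRAND NAME : "' + bname + '"\n')
    ff.write('MODEL : "' + mname + '"\n')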

Can't write in a CSV file python

I am trying to write data into a CSV file after scraping with a pandas DataFrame, but the CSV is empty even after the program runs. The headers are written first, but they are overwritten once the DataFrame comes into play.
Here is the code:
from bs4 import BeautifulSoup
import requests
import re as resju
import csv
import pandas as pd

re = requests.get('https://www.farfeshplus.com/Video.asp?ZoneID=297')
soup = BeautifulSoup(re.content, 'html.parser')
links = soup.findAll('a', {'class': 'opacityit'})
links_with_text = [a['href'] for a in links]

headers = ['Name', 'LINK']
# this is output file, u can change the path as you desire, default is the working directory
file = open('data123.csv', 'w', encoding="utf-8")
writer = csv.writer(file)
writer.writerow(headers)

for i in links_with_text:
    new_re = requests.get(i)
    new_soup = BeautifulSoup(new_re.content, 'html.parser')
    m = new_soup.select_one('h1 div')
    Name = m.text
    print(Name)

    n = new_soup.select_one('iframe')
    ni = n['src']
    iframe = requests.get(ni)
    i_soup = BeautifulSoup(iframe.content, 'html.parser')
    d_script = i_soup.select_one('body > script')
    d_link = d_script.text
    mp4 = resju.compile(r"(?<=mp4:\s\[\')(.*)\'\]")
    final_link = mp4.findall(d_link)[0]
    print(final_link)

    df = pd.DataFrame(zip(Name, final_link))
    df.to_csv(file, header=None, index=False)

file.close()
df.head() returns:
   0  1
0  ل  h
1  ي  t
2  ل  t
3  ى  p
4     s
Any suggestions?
It seems you are using a mix of libraries to write the CSV; pandas handles all of this nicely, so there is no need to use Python's built-in csv module.
I've modified your code below - it collects everything into a single DataFrame and writes it out as one CSV.
Also, by passing header=None you were setting the columns to nothing, so they could only be referenced by an index number.
from bs4 import BeautifulSoup
import requests
import re as resju
#import csv
import pandas as pd

re = requests.get('https://www.farfeshplus.com/Video.asp?ZoneID=297')
soup = BeautifulSoup(re.content, 'html.parser')
links = soup.findAll('a', {'class': 'opacityit'})
links_with_text = [a['href'] for a in links]

names_ = []  # global list to hold all iterable variables from your loops
final_links_ = []

for i in links_with_text:
    new_re = requests.get(i)
    new_soup = BeautifulSoup(new_re.content, 'html.parser')
    m = new_soup.select_one('h1 div')
    Name = m.text
    names_.append(Name)  # append to global list.
    print(Name)

    n = new_soup.select_one('iframe')
    ni = n['src']
    iframe = requests.get(ni)
    i_soup = BeautifulSoup(iframe.content, 'html.parser')
    d_script = i_soup.select_one('body > script')
    d_link = d_script.text
    mp4 = resju.compile(r"(?<=mp4:\s\[\')(.*)\'\]")
    final_link = mp4.findall(d_link)[0]
    print(final_link)
    final_links_.append(final_link)  # append to global list.

df = pd.DataFrame(zip(names_, final_links_))  # use global lists.
df.columns = ['Name', 'LINK']
df.to_csv('data123.csv', index=False)  # write once, after the loop
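As a short illustration of why the original per-iteration DataFrame came out as single characters: zip() over two strings pairs them character by character, while zip() over two lists of collected values pairs whole entries. A minimal example with hypothetical values:
import pandas as pd

name = "Example clip"
link = "https://example.com/video.mp4"

# zipping two strings iterates character by character
print(pd.DataFrame(zip(name, link)).head())
#    0  1
# 0  E  h
# 1  x  t  ...

# zipping two lists pairs whole values: one row per item
names_ = [name]
links_ = [link]
print(pd.DataFrame(zip(names_, links_), columns=['Name', 'LINK']))
#            Name                           LINK
# 0  Example clip  https://example.com/video.mp4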

PyPDF3 Merger limit workaround

I am trying to merge 1000+ PDF pages. It works with fewer than 750 pages; if I open more than 750, it still runs, but the output file is 0 bytes.
from PyPDF3 import PdfFileWriter, PdfFileReader, PdfFileMerger
import os
import sys
from collections import OrderedDict
import win32file

win32file._setmaxstdio(8192)
print(win32file._getmaxstdio())
sys.setrecursionlimit(30000)

nameOfFile = os.path.basename(os.getcwd())

#get page number
def getPageNr(arg1):
    stro = str(arg1)
    stro = stro.replace('.pdf', '')
    listR = stro.split(' - ')
    listR[len(listR)-1] = listR[len(listR)-1].replace('-','')
    listR[len(listR)-1] = listR[len(listR)-1].replace('Page ','')
    pgNr = int(listR[len(listR)-1])
    return pgNr

currentFolder = os.getcwd()
pdffiles = [os.path.join(name)
            for root, dirs, files in os.walk(currentFolder)
            for name in files
            if name.endswith((".pdf"))]

#create dictionary and get whole list
di = {}
#direct copy and create key from page number on back and value is original list
for string in pdffiles:
    di.setdefault(getPageNr(string), str(string))
#sort it by keys
di2 = OrderedDict(sorted(di.items()))
pdffiles.clear()
for key, values in di2.items():
    pdffiles.append(values)

#put a correction
pageAt = 0
adder = 421
pageAt = pageAt + adder
#add global variables for page in bookmark
mainTitlePage = 0
secondTitlePage = 0
thirdTitlePage = 0
#define globals for bookmarks
mainTitle = ''
SecondTitle = ''
thirdTitle = ''
#define previous bookmarks
lastMainTitle = ''
lastSecondTitle = ''
lastThirdTitle = ''
#if main title is same as next page
isSame = True
#start Merger
editer = PdfFileMerger()

#start main loop
while pageAt < (adder+2000) and pageAt < len(pdffiles) and isSame:
    #break filename to titles
    titles = pdffiles[pageAt].split(' - ')
    #break next page for titles
    titlesNext = pdffiles[pageAt+1].split(' - ')
    #get titles
    mainTitle = titles[0]
    secondTitle = titles[1]
    if not titlesNext[0] == mainTitle:
        isSame = False
    hasThird = False
    if len(titles) > 4:
        thirdTitle = titles[2]
        hasThird = True
    else:
        thirdTitle = None
        hasThird = False
    #open individual page
    kStream = open(pdffiles[pageAt], 'rb')
    inputK = PdfFileReader(kStream)
    #test if titles are changing
    if not mainTitle == lastMainTitle:
        KmainParent = editer.addBookmark(mainTitle, 0)
    if not secondTitle == lastSecondTitle:
        secondTitlePage = pageAt - adder
        #print(secondTitle)
        Kparent = editer.addBookmark(secondTitle, secondTitlePage, KmainParent)
    if hasThird:
        if not thirdTitle == lastThirdTitle:
            thirdTitlePage = pageAt - adder
            Mparent = editer.addBookmark(thirdTitle, thirdTitlePage, Kparent)
        editer.addBookmark(titles[3], pageAt-adder, Mparent)
    else:
        editer.addBookmark(titles[2], pageAt-adder, Kparent)
    #merge page with fixed bookmarks
    editer.merge((pageAt - adder), inputK)
    #get titles and save them for future
    lastMainTitle = mainTitle
    lastSecondTitle = secondTitle
    lastThirdTitle = thirdTitle
    #go to next page
    pageAt += 1

#get name for output file
nameOfFile = mainTitle + '.pdf'
print('Saving ' + nameOfFile)
#start new file and export it
outR = open(nameOfFile, 'wb')
editer.write(outR)
outR.close()
kStream.close()
Now it puts in all the bookmarks, no problem there. But how can I process more than 750 pages?
I have increased the recursion limit and maxstdio, but if there are 1000 or more pages the merged file is 0 bytes, even though the process takes a minute or two, so it is doing something.
I do not get any errors.
Can anybody help me process more than 500 pages?
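Since the question already points at the open-file limit (the maxstdio calls), here is a minimal sketch of one possible workaround, assuming the 0-byte output comes from too many file handles staying open until editer.write(): read each page into memory with io.BytesIO so its handle can be closed immediately. This is an untested sketch against the question's setup, not a verified fix, and it trades open handles for memory.
import io
from PyPDF3 import PdfFileMerger

def merge_in_memory(pdf_paths, out_path):
    """Merge PDFs without keeping one OS file handle open per page."""
    merger = PdfFileMerger()
    for position, path in enumerate(pdf_paths):
        with open(path, 'rb') as fh:
            buffer = io.BytesIO(fh.read())   # handle is closed when the with-block ends
        merger.merge(position, buffer)       # merger only holds the in-memory stream
    with open(out_path, 'wb') as out:
        merger.write(out)
    merger.close()

# usage with the sorted list built in the question:
# merge_in_memory(pdffiles, nameOfFile)
For very large merges the pages could also be appended in batches and the intermediate files merged afterwards.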

Having trouble into saving something to a csv file

My program does everything I want, but it is not saving the final data to the CSV file. I used a print just before it to check that the data is right, and it is; it is just not being written to the CSV file. I'm using 'a' because I don't want it to overwrite what's already written, but it still returns an error.
Here's the relevant part of the code:
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class":"formTable"}):
    for row in table.findAll('tr'):
        #heading = row.find('td', {"class":"sectionHeading"})
        #if heading is not None:
            #print(heading.get_text());
        #else:
        label = row.find('td', {"class":"fieldLabel"})
        data = row.find('td', {"class":"fieldData"})
        if data is not None and label is not None:
            csvline += label.get_text() + "," + data.get_text() + ","
            print(csvline)
            #csvline.encode('utf-8')
            with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(csvline)
Here's the error:
Traceback (most recent call last):
  File "C:\PROJECT\pdfs\final.py", line 95, in <module>
    with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
TypeError: 'encoding' is an invalid keyword argument for this function
Here's the entire program in case it's needed:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
#import unicodecsv as csv
import csv
#import pickle
import requests
from robobrowser import RoboBrowser
import codecs

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)

rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " % args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors

htmls_path = 'C:\\PROJECT'
with open ('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
            text = soup.get_text()
            match = re.findall("PA/(\S*)", text)  #To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
            print(match)
            writer.writerow(match)

for item in match:
    data = item.split('/')
    case_number = data[0]
    case_year = data[1]
    csvline = case_number + ","

    browser = RoboBrowser()
    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
    form = browser.get_forms()[0]  # Get the first form on the page
    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])

    # Use BeautifulSoup to parse this data
    answer = browser.response.text
    #print(answer)
    soup = BeautifulSoup(answer)
    for table in soup.findAll('table', {"class":"formTable"}):
        for row in table.findAll('tr'):
            #heading = row.find('td', {"class":"sectionHeading"})
            #if heading is not None:
                #print(heading.get_text());
            #else:
            label = row.find('td', {"class":"fieldLabel"})
            data = row.find('td', {"class":"fieldData"})
            if data is not None and label is not None:
                csvline += label.get_text() + "," + data.get_text() + ","
                print(csvline)
                with open ('output_file_two.csv', 'a') as f:
                    writer = csv.writer(f)
                    writer.writerow(csvline)
EDIT
It's working now; here's the working code:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import unicodecsv as csv
import requests
from robobrowser import RoboBrowser
import codecs

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)

rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        if not file_name.endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " % args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors

htmls_path = 'C:\\PROJECT'
with open ('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
            text = soup.get_text()
            match = re.findall("PA/(\S*)", text)
            print(match)
            writer.writerow(match)

for item in match:
    data = item.split('/')
    case_number = data[0]
    case_year = data[1]
    csvline = case_number + ","

    browser = RoboBrowser()
    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
    form = browser.get_forms()[0]
    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])

    answer = browser.response.text
    soup = BeautifulSoup(answer)
    for table in soup.findAll('table', {"class":"formTable"}):
        for row in table.findAll('tr'):
            label = row.find('td', {"class":"fieldLabel"})
            data = row.find('td', {"class":"fieldData"})
            if data is not None and label is not None:
                csvline += label.get_text() + "," + data.get_text() + ","
                print(csvline)
                my_file = codecs.open('final_output.csv', 'a', 'utf-8')
                my_file.write(csvline)
At the end there is a problem with your code:
    writer = csv.writer(f)
    csv.writer(csvline)  # here is the problem
You initialize the writer, but then you don't use it. It should be:
    writer = csv.writer(f)
    writer.writerow(csvline)
Here:
    with open ('output_file_two.csv', 'a') as f:
        writer = csv.writer(f)
        csv.writer(csvline)
You are instantiating a csv.writer, but not using it. This should read:
    with open ('output_file_two.csv', 'a') as f:
        writer = csv.writer(f)
        writer.writerow(csvline)
Now there are quite a few other problems with your code, the first one being that you manually build csvline as text and then use csv.writer to store it to a file. csv.writer.writerows() expects a list of rows (tuples) and takes care of properly escaping what needs to be escaped, inserting the proper delimiters, etc. It also has a writerow() method that takes a single row, which avoids building the whole list in memory, FWIW.
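To make the difference concrete, a small illustration (the label and data values here are hypothetical, not taken from the scraped page): passing a plain string to writerow() writes one character per field, while passing a list writes one field per item.
import csv

label = "Case Number"
data = "PA/123/2016"

with open('output_file_two.csv', 'a') as f:
    writer = csv.writer(f)

    # a string is a sequence of characters, so every character becomes its own column:
    # C,a,s,e, ,N,u,m,b,e,r
    writer.writerow(label)

    # a list of values gives one column per value:
    # Case Number,PA/123/2016
    writer.writerow([label, data])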

When trying to extract meta data out of images on webpages, keeps returning {}, why?

I've looked at the exifread documentation and it says the result is returned as a dictionary, but the problem is that it returns nothing except {}. I don't know if that means there is no metadata in the image or I made a rookie mistake. I've spent a good chunk of time looking at my code and the documentation but still can't find the solution; any help would be appreciated :)
Code:
import exifread
import colorama
import urllib2
import urllib
import random
import time
import bs4
import sys

def get_images(target):
    colorama.init()
    print(colorama.Fore.LIGHTGREEN_EX + "[*] Retrieving Meta Data from Target's Page...")
    req = urllib2.Request(target)
    resp = urllib2.urlopen(req)
    page = resp.read()
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        if "www" in src or "http" in src or "https" in src:
            rand_num = random.random()
            name = str(rand_num) + ".jpg"
            urllib.urlretrieve(src, name)
            f = open(name, "rb")
            tags = exifread.process_file(f)
            print (tags)
        else:
            s = target + src
            rand_num = random.random()
            name = str(rand_num) + ".jpg"
            urllib.urlretrieve(s, name)
            f = open(name, "rb")
            tags = exifread.process_file(f)
            print (tags)
    return

def main():
    target = raw_input("Enter the target: ")
    print ("\n")
    get_images(target)
    time.sleep(5)
    sys.exit()

if __name__ == "__main__":
    main()
The problem is that you were not passing a base URL; you need to take the host and join it to the src, unless the src attribute already contains an absolute URL.
The following code demonstrates a working example. I used requests in place of urllib, but the logic is the same:
import bs4
import os
import requests
import exifread
from urlparse import urljoin

def get_images(target, base):
    page = requests.get(target).content
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img", src=True):
        src = img.get("src")
        name = os.path.basename(src)
        if not src.startswith(("www.", "http:", "https:")):
            src = urljoin(base, src)
        with open(name, "wb+") as f:
            f.write(requests.get(src).content)
            f.seek(0)
            tags = exifread.process_file(f)
            print (tags)

def main():
    target = "http://www.exiv2.org/sample.html"
    # need base to join to relative src
    base = "http://www.exiv2.org/"
    get_images(target, base)

if __name__ == "__main__":
    main()
You will get the EXIF data for the one image on the page that has some.
A PIL example:
import bs4
import os
import requests
from urlparse import urljoin
import PIL.Image

def get_images(target, base):
    page = requests.get(target).content
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        name = os.path.basename(src)
        if not src.startswith(("www.", "http:", "https:")):
            src = urljoin(base, src)
        with open(name, "wb+") as f:
            f.write(requests.get(src).content)
            f.seek(0)
            try:
                img = PIL.Image.open(f)
                exif_data = img._getexif()
                print(exif_data)
            except AttributeError as e:
                print("No exif data for {}".format(name))
                os.remove(name)
os.remove(name) will delete files that have no EXIF data; if you don't want that to happen, remove that line.
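As a usage note, an empty dict from exifread.process_file() simply means the image carries no EXIF tags (common for web-optimised images); when tags are present they can be read by key. A small sketch, with a hypothetical local file name:
import exifread

# hypothetical local file; any JPEG that carries EXIF data will do
with open("sample.jpg", "rb") as f:
    tags = exifread.process_file(f)

if not tags:
    print("No EXIF data in this image")
else:
    # keys look like "Image Make", "Image Model", "EXIF DateTimeOriginal", ...
    for key in ("Image Make", "Image Model", "EXIF DateTimeOriginal"):
        if key in tags:
            print(key, "=>", tags[key])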
