I have the following problem. I wrote a simple "TextBasedBrowser" (if you can even call it a browser at this point :D). The website scraping and parsing with BS4 works great so far, but the output is badly formatted and pretty much unreadable. As soon as I try to use the prettify() method from BS4, it throws an AttributeError. I searched for quite a while on Google but couldn't find anything. This is my code (the prettify() call is commented out there):
from bs4 import BeautifulSoup
import requests
import sys
import os
legal_html_tags = ['p', 'a', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']
saved_pages = []
def search_url(url):
    """Fetch https://<url> and return the page HTML.

    Also records the page's cache key in `saved_pages` so a later request
    for the same URL can be served from the on-disk copy.
    """
    # str.rstrip(".com") strips any trailing '.', 'c', 'o', 'm' *characters*,
    # not the literal suffix (e.g. "mycom.com".rstrip(".com") -> "my").
    # Slice the suffix off explicitly instead.
    key = url[:-len(".com")] if url.endswith(".com") else url
    saved_pages.append(key)
    response = requests.get(f'https://{url}')
    return response.text
def parse_html(html_page):
    """Return the visible text of the page, keeping only whitelisted tags.

    Each text node whose parent tag appears in `legal_html_tags` is kept
    and followed by a single space.
    """
    soup = BeautifulSoup(html_page, 'html.parser')
    fragments = [
        '{} '.format(node)
        for node in soup.find_all(text=True)
        if node.parent.name in legal_html_tags
    ]
    return ''.join(fragments)
def save_webpage(url, tb_dir):
    """Download `url` and store its parsed text as <tb_dir>/<name>.txt."""
    # rstrip(".com") removes a trailing run of '.', 'c', 'o', 'm' characters,
    # not the literal ".com" suffix; strip the suffix explicitly so the
    # file name matches the key stored in saved_pages.
    name = url[:-len(".com")] if url.endswith(".com") else url
    with open(f'{tb_dir}/{name}.txt', 'w', encoding="utf-8") as tab:
        tab.write(parse_html(search_url(url)))
def check_url(url):
    """Return True when the URL ends in one of the supported TLDs."""
    # str.endswith accepts a tuple of suffixes, replacing the `or` chain.
    return url.endswith((".com", ".org", ".net"))
args = sys.argv
directory = args[1]

# Create the cache directory; complain (but keep running) if it exists.
try:
    os.mkdir(directory)
except FileExistsError:
    print("Error: File already exists")

# Simple REPL: read a URL per line until the user types "exit".
while True:
    command = input()
    if command == "exit":
        break
    if command in saved_pages:
        # Seen before: serve the cached copy from disk.
        with open(f'{directory}/{command}.txt', 'r', encoding="utf-8") as cached:
            print(cached.read())
        continue
    if not check_url(command):
        print("Error: Invalid URL")
        continue
    # New, valid URL: fetch, cache, and display it.
    save_webpage(command, directory)
    print(parse_html(search_url(command)))
And this is the Error:
Traceback (most recent call last):
File "browser.py", line 56, in <module>
save_webpage(url_, directory)
File "browser.py", line 29, in save_webpage
tab.write(parse_html(search_url(url)))
File "browser.py", line 20, in parse_html
plain_text = soup.find_all(text=True)
AttributeError: 'str' object has no attribute 'find_all'
If I include the encoding parameter in the prettify() method it throws me 'bytes' instead of 'str' object.
You have re-assigned the soup variable into a string using the .prettify() method
soup = soup.prettify()
find_all() is a method for soup objects only
You should call find_all(text = True) first and extract all html tags with text, then you perform string operations.
prettify turns your parsed HTML object into a string, so you can’t call find_all on it. Maybe you just want to return soup.prettify()?
This might be what you want:
def parse_html(html_page):
    """Collect the text inside whitelisted tags, separated for readability."""
    final_text = ""
    soup = BeautifulSoup(html_page, 'html.parser')
    plain_text = soup.find_all(text=True)
    for t in plain_text:
        if t.parent.name in legal_html_tags:
            # `t` is a NavigableString; only Tag objects have prettify(),
            # so t.prettify() would raise AttributeError.  Use the text
            # itself and add a separator instead.
            final_text += t.strip() + " "
    return final_text
Related
I want to do a search using keywords from a file in a loop. using Selenium and BeatifulSoup
read 1st. row, put the value of it (one keyword) into the search query area, and search, when done, use the 2nd row from the file, and so on.
the read file part does print all keywords, one on each row, but I am not sure how to put it into the search query area, one at a time.
# NOTE(review): question code as posted -- indentation was lost in the paste.
def SearchFuncs():
driver.get('https://www.website.com/search/?q=pet%20care') # put the value from one row on search/?q=
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
a = soup.select('div.class_name a')
# NOTE(review): `return` fires on the first iteration, so only the first
# matching anchor is ever returned and the loop variable shadows `a` above.
for a in soup.select('div.class_name a'):
#print(a['title'])
return a
#SearchFuncs()
x = SearchFuncs()
# x is a single bs4 Tag here; x['title'] reads its title attribute.
print(x ['title'])
# read file section: prints each keyword, but the keywords are not yet
# fed into the search URL built above.
with open ("kw-to-search.txt", "r") as f:
for line in f:
print(line.strip())
Updated: I also added save the result to file
but I tested the codes without save to file section
This is the code I tried, using one of the solutions (broderick) provided — thank you, broderick. I get no output and no error:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time
def SearchFuncs(addr):
driver.get(addr)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
a = soup.select('div.class_name a')
# NOTE(review): both statements in the loop body are commented out, leaving
# the for-loop with an empty body -- that is a SyntaxError, so this version
# cannot run at all as written.
for a in soup.select('div.class_name a'):
#return a
#print(a ['title'])
with open ("kw.txt", "r") as f:
for line in f:
addr_to_search = 'https://www.website.com/search/?q='
# Build search query from lines
pieces = line.split()
query = ''
for i in range(len(pieces) - 1):
query += (pieces[i] + '%20')
query += pieces[-1]
# Debugging print
print(query)
addr_to_search += query
SearchFuncs(addr_to_search)
# NOTE(review): `a` is not defined at this scope -- the variable of that name
# lives inside SearchFuncs; this line would raise NameError if reached.
textList = a['title']
outF = open("keyword_result.txt", 'a')
# NOTE(review): if textList is a single string, this loop writes it one
# character at a time, and the line below then writes it a second time.
for line in textList:
# write line to output file
outF.write(line)
#outF.write("\n")
outF.write(textList + '\n')
outF.close()
Updated with another code
This is another variation Arthur Pereira provided, thank you, Arthur Pereira
def SearchFuncs(url):
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
a = soup.select('div.class_name a')
# NOTE(review): `return` on the first iteration means only the first
# matching anchor is ever returned.
for a in soup.select('div.class_name a'):
return a
#y = SearchFuncs(url)
#print(y ['title'])
#print(a['title'])
# NOTE(review): per the traceback below this line sits inside SearchFuncs in
# the real file; when select() matches nothing, `a` is still the empty result
# *list*, and list['title'] raises the posted TypeError.
textList = a['title']
outF = open("Keyword_results-2.txt", 'a')
# NOTE(review): if textList is a single title string this loop writes it
# character by character, then the write below duplicates it.
for line in textList:
# write line to output file
outF.write(line)
#outF.write("\n")
outF.write(textList + '\n')
outF.close()
with open("kw.txt", "r") as f:
for line in f:
query = line.strip().replace(" ", "%20")
url = "https://www.website.com/search/?q=" + query
SearchFuncs(url)
Error:
Traceback (most recent call last):
File "c:/Users/mycomp/Desktop/Python/test/Test-Search-on-Pin-fromList-1.py", line 45, in <module>
SearchFuncs(url)
File "c:/Users/mycomp/Desktop/Python/test/Test-Search-on-Pin-fromList-1.py", line 31, in SearchFuncs
textList = a['title']
TypeError: list indices must be integers or slices, not str
Iterate over each line in your text and prepare it to search. Then pass this url to your search function as a parameter:
Also, I think you misunderstand the concept of return. Here your code just returns the first a element; nothing runs after that, because return leaves the function.
for a in soup.select('div.Eqh.F6l.Jea.k1A.zI7.iyn.Hsu a'):
return a
The error you are getting is because your select finds nothing, so `a` is still the (empty) result list — and you end up indexing a list with a string:
textList = a['title']
So, assuming you want to get the text inside each anchor element, you have to find the correct div and jump into the a element. Then you can get the title and write it to a file.
def SearchFuncs(url):
    """Open `url` in the Selenium browser and append each result's title
    to Keyword_results-2.txt, one per line."""
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    collection = soup.select('div.Collection-Item a')
    if collection:
        # Open the output file once per page instead of re-opening it for
        # every single anchor.
        with open("Keyword_results-2.txt", 'a', encoding="utf-8") as outF:
            for item in collection:
                title = item['title'].strip()
                outF.write(title + '\n')  # write line to output file

# Build one search URL per keyword line and run the search for each.
with open("kw.txt", "r") as f:
    for line in f:
        query = line.strip().replace(" ", "%20")
        url = "https://www.pinterest.com/search/pins/?q=" + query
        SearchFuncs(url)
Try
# Answer sketch: take the target URL as a parameter instead of hard-coding it.
def SearchFuncs(addr):
driver.get(addr)
...
and
# Read one keyword phrase per line and run a search for each.
with open("kw-to-search.txt", "r") as f:
    for line in f:
        pieces = line.split()
        if not pieces:
            # Skip blank lines: pieces[-1] below would raise IndexError.
            continue
        # URL-encode the spaces between the words; str.join replaces the
        # manual index loop.
        query = '%20'.join(pieces)
        # Debugging print
        print(query)
        addr_to_search = 'https://www.website.com/search/?q=' + query
        SearchFuncs(addr_to_search)
I have this kind of code
# NOTE(review): the file handle is never closed, and .read() pulls the whole
# document into one string before parsing -- for a very large HTML file that
# peak memory use is presumably where the MemoryError originates.
URL = r"C:\Users\jarze\PycharmProjects\CSV-modifier\venv\html.html"
html_report_part1 = open(URL, 'r', encoding="UTF-8").read()
soup = BeautifulSoup(html_report_part1, "html.parser")
and its return this kind of error:
During handling of the above exception, another exception occurred:
MemoryError
Try:
URL = r"C:\Users\jarze\PycharmProjects\CSV-modifier\venv\html.html"
# BeautifulSoup accepts an open file object directly, so there is no need to
# accumulate the text with repeated string concatenation (which is quadratic
# and roughly doubles peak memory use).  The with-block also guarantees the
# file handle is closed.
with open(URL, 'r', encoding="UTF-8") as html_report_part1:
    soup = BeautifulSoup(html_report_part1, "html.parser")
I hope you're well. Could you please tell me why I can't use my scraping script properly? :)
It works with other website. I'm a beginner so I probably made a basic mistake
import requests
from bs4 import BeautifulSoup
import time
import csv
links = []

for i in range(1):
    # The page number must be substituted into the {} placeholder;
    # '...?page={}' + str(i) produced the literal URL '...?page={}0'.
    url = '*******/recettes/?page={}'.format(i)
    # One request is enough; keep the User-Agent header so the site does
    # not reject the scraper (the original fetched the page twice).
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    print(response)
    if response.ok:
        print('Page: ' + str(i))
        soup = BeautifulSoup(response.text, "html.parser")
        divs = soup.findAll('div', class_='field-item even')
        for div in divs:
            a = div.find('a')
            # find() returns None when the div holds no <a>; guard it to
            # avoid "TypeError: 'NoneType' object is not subscriptable".
            if a is None:
                continue
            links.append('*******' + a['href'])
        time.sleep(3)  # be polite between page fetches

print(len(links))

with open('urls3.txt', 'w') as file:
    for link in links:
        file.write(link + '\n')
"""
with open('urls3.txt', 'r') as inf:
with open('recipes3.csv', 'w') as outf:
outf.write('titre,image,url,\n')
for row in inf:
url = row.strip()
response = requests.get(url)
if response.ok:
soup = BeautifulSoup(response.text, "html.parser")
titre = soup.find('h1')
image = soup.find('img', {"id":"recipe-media-viewer-thumbnail-1"})['src']
print(titre.text, image, url)
outf.write(str(titre.text) + ',' + str(image) + ',' + str(url) + '\n')
time.sleep(1)
"""
Could you please tell me why there is a mistake here :
<Response [200]>
Page: 0
Traceback (most recent call last):
File "ex3.py", line 18, in <module>
link = a['href']
TypeError: 'NoneType' object is not subscriptable
I've found the answer, I post it here :) for anyone interested
try:
    image = soup.find('img', {"id": "recipe-media-viewer-thumbnail-1"})['src']
except TypeError:
    # find() returned None (no such <img>), so subscripting raised
    # TypeError; fall back to "no image".  Catching only TypeError keeps
    # unrelated errors visible, unlike the original bare `except Exception`.
    image = None
I get the following error when trying to parse a large number of web pages from a website : "Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)'. How can I fix it ?
full error message is :
File "main.py", line 29, in <module>
records = p.map(defs.scrape,state_urls)
File "C:\Users\Utilisateur\Anaconda3\lib\multiprocessing\pool.py", line 266, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\Utilisateur\Anaconda3\lib\multiprocessing\pool.py", line 644, in get
raise self._value
multiprocessing.pool.MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x0000018DD1C3D828>'. Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)'
I browsed through some of the answers for similar questions here, namely this one (multiprocessing.pool.MaybeEncodingError: Error sending result: Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)') but I don't think I'm running into the same issue, as I don't handle files directly in the scrape function.
I tried modifying the scrape function so it returned a string and not a list (don't know why I did that) but I didn't work.
From the main.py file :
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from multiprocessing import Pool
import codecs
import defs
if __name__ == '__main__':
filename = "some_courts_test.csv"
# not the actual values
courts = ["blabla", "blablabla", "blablabla","blabla"]
client = defs.init_client()
i = 1
# scrapes the data from the website and puts it into a csv file
for court in courts:
records = []
records_string =""
print("creating a file for the court of : "+court)
f = defs.init_court_file(court)
print("generating urls for the court of "+court)
state_urls = defs.generate_state_urls(court)
for url in state_urls:
print(url)
print("scraping creditors from : "+court)
# NOTE(review): a fresh Pool(10) is created on every court iteration;
# creating one pool outside the loop would avoid repeated process startup.
p = Pool(10)
records = p.map(defs.scrape,state_urls)
# NOTE(review): records[1] keeps only the result of the *second* URL --
# presumably all results were intended; confirm against the writer below.
records_string = ''.join(records[1])
p.terminate()
p.join()
# NOTE(review): iterating a string writes it one character at a time;
# f.write(records_string) would be equivalent in a single call.
for r in records_string:
f.write(r)
records = []
f.close()
from the defs file:
def scrape(url):
    """Download `url`, extract the striped table's body and return its cells.

    Returns a flat list of strings: each non-empty cell is followed by a
    ',' entry and each table row ends with a newline entry, i.e. the
    pieces of a CSV fragment ready to be joined and written.
    """
    uClient = uReq(url)
    try:
        page_html = uClient.read()
    finally:
        # Close the connection even if read() fails.
        uClient.close()

    page_soup = soup(page_html, "html.parser")
    table = page_soup.find("table", {"class": "table table-striped"})
    table_body = table.find('tbody')

    data = []
    for row in table_body.find_all('tr'):
        # Commas inside a cell would corrupt the CSV, so replace them.
        cells = [ele.text.replace(',', ' ') for ele in row.find_all('td')]
        for ele in cells:
            if ele:
                data.append(ele)
                data.append(',')
        data.append('\n')
    return data
I have a log file which contains:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
http://www.downloadray.com/windows/Photos_and_Images/Graphic_Capture/
http://www.downloadray.com/windows/Photos_and_Images/Digital_Photo_Tools/
I have this code:
from bs4 import BeautifulSoup
import urllib
import urlparse
f = open("downloadray2.txt")
g = open("downloadray3.txt", "w")
for line in f.readlines():
i = 1
while 1:
# NOTE(review): `line` still ends with '\n', so the request URL contains an
# embedded newline before "?page=..." -- strip the line first.
url = line+"?page=%d" % i
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
# NOTE(review): has_more is always truthy and never updated, so the
# else/break branch below is unreachable and the while-loop never ends.
has_more = 1
for a in soup.select("div.n_head2 a[href]"):
try:
print (a["href"])
g.write(a["href"]+"\n")
except:
# NOTE(review): bare except hides every error, not just missing hrefs.
print "no link"
if has_more:
i += 1
else:
break
This code does not raise an error, but it does not work either.
I tried modifying it but could not solve the problem.
But when I try this code,it works well:
from bs4 import BeautifulSoup
import urllib
import urlparse
g = open("downloadray3.txt", "w")
# This variant "works" because the URL is hard-coded here, with no trailing
# newline to corrupt the request.
url = "http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/"
pageUrl = urllib.urlopen(url)
soup = BeautifulSoup(pageUrl)
i = 1
while 1:
url1 = url+"?page=%d" % i
pageHtml = urllib.urlopen(url1)
soup = BeautifulSoup(pageHtml)
# NOTE(review): has_more is a constant -- the else/break branch below is
# unreachable, so this loop only ends when the program is interrupted.
has_more = 2
for a in soup.select("div.n_head2 a[href]"):
try:
print (a["href"])
g.write(a["href"]+"\n")
except:
# NOTE(review): bare except hides every error, not just missing hrefs.
print "no link"
if has_more:
i += 1
else:
break
So how can I make it read the links from the log text file? Feeding the links in one at a time by hand is tedious.
Have you stripped the newline from the end of the line?
for line in f.readlines():
line = line.strip()
readlines() will produce a list of lines taken from the file including the newline \n character.
Proof — evidence obtained by printing the url variable (after the line url = line+"?page=%d" % i):
Your original code:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=1
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=2
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=3
With my suggested fix:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=1
http://www.downloadray.com/TIFF-to-JPG_download/
http://www.downloadray.com/Moo0-Image-Thumbnailer_download/
http://www.downloadray.com/Moo0-Image-Sizer_download/
http://www.downloadray.com/Advanced-Image-Viewer-and-Converter_download/
http://www.downloadray.com/GandMIC_download/
http://www.downloadray.com/SendTo-Convert_download/
http://www.downloadray.com/PNG-To-JPG-Converter-Software_download/
http://www.downloadray.com/Graphics-Converter-Pro_download/
http://www.downloadray.com/PICtoC_download/
http://www.downloadray.com/Free-Images-Converter_download/
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=2
http://www.downloadray.com/VarieDrop_download/
http://www.downloadray.com/Tinuous_download/
http://www.downloadray.com/Acme-CAD-Converter_download/
http://www.downloadray.com/AAOImageConverterandFTP_download/
http://www.downloadray.com/ImageCool-Converter_download/
http://www.downloadray.com/GeoJpeg_download/
http://www.downloadray.com/Android-Resizer-Tool_download/
http://www.downloadray.com/Scarab-Darkroom_download/
http://www.downloadray.com/Jpeg-Resizer_download/
http://www.downloadray.com/TIFF2PDF_download/
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=3
http://www.downloadray.com/JGraphite_download/
http://www.downloadray.com/Easy-PNG-to-Icon-Converter_download/
http://www.downloadray.com/JBatch-It!_download/
http://www.downloadray.com/Batch-It!-Pro_download/
http://www.downloadray.com/Batch-It!-Ultra_download/
http://www.downloadray.com/Image-to-Ico-Converter_download/
http://www.downloadray.com/PSD-To-PNG-Converter-Software_download/
http://www.downloadray.com/VectorNow_download/
http://www.downloadray.com/KeitiklImages_download/
http://www.downloadray.com/STOIK-Smart-Resizer_download/
Update:
Then again, this code won't run as expected, because the while loop will never continue since the has_more variable is never changed.
You know that you don't have more links when the list returned by `soup.select(...)` is empty. You can check for emptiness using `len(...)`. So that part might go like this:
list_of_links = soup.select("div.n_head2 a[href]")
# An empty selection means the page offers no more links: stop paging.
if len(list_of_links)==0:
break
else:
# NOTE(review): the selector is re-evaluated here; reusing list_of_links
# would avoid the second parse pass.
for a in soup.select("div.n_head2 a[href]"):
print (a["href"])
g.write(a["href"]+"\n")
i += 1
Apparently the page still display the latest page available if it's queried beyond the maximum page. So if the maximum page number available is 82 and you query page 83, it will give page 82. To detect this case, you can save the list of previous page urls, and compare it with current list of urls.
Here is the full code (tested):
from bs4 import BeautifulSoup
import urllib
import urlparse
f = open("downloadray2.txt")
g = open("downloadray3.txt", "w")
for line in f.readlines():
line = line.strip()
i = 1
prev_urls = []
while 1:
url = line+"?page=%d" % i
print 'Examining %s' % url
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
list_of_urls = soup.select("div.n_head2 a[href]")
if set(prev_urls)==set(list_of_urls):
break
else:
for a in soup.select("div.n_head2 a[href]"):
print (a["href"])
g.write(a["href"]+"\n")
i += 1
prev_urls = list_of_urls