I have this kind of code:
URL = r"C:\Users\jarze\PycharmProjects\CSV-modifier\venv\html.html"
html_report_part1 = open(URL, 'r', encoding="UTF-8").read()
soup = BeautifulSoup(html_report_part1, "html.parser")
and it returns this kind of error:
During handling of the above exception, another exception occurred:
MemoryError
Try:
URL = r"C:\Users\jarze\PycharmProjects\CSV-modifier\venv\html.html"
html_text = ''
with open(URL, 'r', encoding="UTF-8") as html_report_part1:
    for line in html_report_part1:  # iterate the file lazily; readlines() would load every line into memory at once
        html_text += line
soup = BeautifulSoup(html_text, "html.parser")
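If the file is too large for even one in-memory copy of the text, note that BeautifulSoup also accepts an open filehandle directly, so the intermediate Python string can be skipped entirely. A minimal sketch (same path as above, assuming the parsed tree itself still fits in memory):

from bs4 import BeautifulSoup

URL = r"C:\Users\jarze\PycharmProjects\CSV-modifier\venv\html.html"
with open(URL, 'r', encoding="UTF-8") as f:
    # BeautifulSoup reads from the filehandle itself; no html_text string is built.
    soup = BeautifulSoup(f, "html.parser")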
I'm trying to scrape a URL for elements and have them written to a file, but it doesn't seem to work. The function runs fine when it just prints them.
def get_price():
    page = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(page.content, 'lxml')
    while True:
        try:
            product_title = soup.find("span", {'id': "productTitle"}).text.strip()
            # price = float((soup.find("span",{'class': "a-offscreen"}).text.split("₹")[1]).replace(',',""))
            f = open("Textfile.txt", "w")
            f.write(product_title)
            f.close()
            break
        except AttributeError:
            get_price()
            break

get_price()
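No answer was posted for this one, but two things in the snippet are worth flagging: the recursive get_price() call inside the except block re-fetches the page on every retry, and the file is opened with the platform's default encoding, which can raise on non-ASCII characters in the title on Windows. A hedged rewrite (my sketch, not the original poster's fix, assuming url and HEADERS are defined elsewhere as in the question):

def get_price(retries=3):
    # Retry with a bounded loop instead of recursing inside except.
    for _ in range(retries):
        page = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(page.content, 'lxml')
        title_tag = soup.find("span", {'id': "productTitle"})
        if title_tag is not None:
            # Explicit utf-8 avoids UnicodeEncodeError on non-ASCII titles.
            with open("Textfile.txt", "w", encoding="utf-8") as f:
                f.write(title_tag.text.strip())
            return

get_price()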
I'm trying to open a file using Python, for storing the article extracted from a URL.
import urllib.request
from bs4 import BeautifulSoup
url = "http://www.thedrum.com/news/2015/07/29/mankind-must-get-ahead-technical-development-states-phds-mark-holden-following"
openfile = urlopen(url)
soup = BeautifulSoup(openfile)
f = open("test1.txt", "w")
for data in soup.find_all("p"):
    sum1 = data.get_text()
    f.writelines(sum1)
f.close()
But it shows TypeError: 'HTTPResponse' object is not callable, pointing to f = open("test1.txt", "w").
How do I solve this?
You need to make the soup object from the HTML content (which you get from the read() function), not from the response object:
with urllib.request.urlopen(url) as r, open("test1.txt", "w") as f:
    soup = BeautifulSoup(r.read().decode('utf-8'))
    for data in soup.find_all("p"):
        sum1 = data.get_text()
        f.writelines(sum1)
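One small addition on top of that answer (not part of the original): recent versions of bs4 emit a GuessedAtParserWarning when no parser is named, and the guessed parser can differ between machines, so it is safer to pass one explicitly:

soup = BeautifulSoup(r.read().decode('utf-8'), "html.parser")  # explicit parser keeps results reproducible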
I've got the following problem. I wrote a simple "TextBasedBrowser" (if you can even call it a browser at this point :D). The website scraping and parsing with BS4 works great so far, but the output is formatted terribly and is pretty much unreadable. As soon as I try to use the prettify() method from BS4, it throws an AttributeError. I searched quite a while on Google but couldn't find anything. This is my code (the prettify() call is commented out):
from bs4 import BeautifulSoup
import requests
import sys
import os

legal_html_tags = ['p', 'a', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']

saved_pages = []

def search_url(url):
    saved_pages.append(url.rstrip(".com"))
    url = requests.get(f'https://{url}')
    return url.text

def parse_html(html_page):
    final_text = ""
    soup = BeautifulSoup(html_page, 'html.parser')
    # soup = soup.prettify()
    plain_text = soup.find_all(text=True)
    for t in plain_text:
        if t.parent.name in legal_html_tags:
            final_text += '{} '.format(t)
    return final_text

def save_webpage(url, tb_dir):
    with open(f'{tb_dir}/{url.rstrip(".com")}.txt', 'w', encoding="utf-8") as tab:
        tab.write(parse_html(search_url(url)))

def check_url(url):
    if url.endswith(".com") or url.endswith(".org") or url.endswith(".net"):
        return True
    else:
        return False

args = sys.argv
directory = args[1]

try:
    os.mkdir(directory)
except FileExistsError:
    print("Error: File already exists")

while True:
    url_ = input()
    if url_ == "exit":
        break
    elif url_ in saved_pages:
        with open(f'{directory}/{url_}.txt', 'r', encoding="utf-8") as curr_page:
            print(curr_page.read())
    elif not check_url(url_):
        print("Error: Invalid URL")
    else:
        save_webpage(url_, directory)
        print(parse_html(search_url(url_)))
And this is the error:
Traceback (most recent call last):
  File "browser.py", line 56, in <module>
    save_webpage(url_, directory)
  File "browser.py", line 29, in save_webpage
    tab.write(parse_html(search_url(url)))
  File "browser.py", line 20, in parse_html
    plain_text = soup.find_all(text=True)
AttributeError: 'str' object has no attribute 'find_all'
If I include the encoding parameter in the prettify() method, it throws 'bytes' instead of 'str' object.
You have re-assigned the soup variable to a string by using the .prettify() method:

soup = soup.prettify()

find_all() is a method of soup objects only. You should call find_all(text=True) first to extract the text under the allowed HTML tags, and only then perform string operations.
prettify turns your parsed HTML object into a string, so you can’t call find_all on it. Maybe you just want to return soup.prettify()?
This might be what you want:

def parse_html(html_page):
    soup = BeautifulSoup(html_page, 'html.parser')
    plain_text = soup.find_all(text=True)  # returns NavigableString text nodes, not Tags
    # Text nodes have no prettify(); that method exists on Tag objects only,
    # so join the stripped strings to keep the output readable instead.
    final_text = ""
    for t in plain_text:
        if t.parent.name in legal_html_tags and t.strip():
            final_text += t.strip() + " "
    return final_text
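If the goal is simply readable output rather than tag filtering, the other suggestion above also works on its own; a minimal sketch (my illustration):

def parse_html(html_page):
    soup = BeautifulSoup(html_page, 'html.parser')
    # prettify() returns a str with one tag or string per indented line,
    # so any find_all() filtering has to happen before this call.
    return soup.prettify()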
I hope you're well. Could you please tell me why I can't use my scraping script properly? :)
It works with other websites. I'm a beginner, so I probably made a basic mistake.
import requests
from bs4 import BeautifulSoup
import time
import csv

links = []
for i in range(1):
    url = '*******/recettes/?page={}' + str(i)
    res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = requests.get(url)
    print(response)
    if response.ok:
        print('Page: ' + str(i))
        soup = BeautifulSoup(response.text, "html.parser")
        divs = soup.findAll('div', class_='field-item even')
        for div in divs:
            a = div.find('a')
            link = a['href']
            links.append('*******' + link)
    time.sleep(3)

print(len(links))

with open('urls3.txt', 'w') as file:
    for link in links:
        file.write(link + '\n')

"""
with open('urls3.txt', 'r') as inf:
    with open('recipes3.csv', 'w') as outf:
        outf.write('titre,image,url,\n')
        for row in inf:
            url = row.strip()
            response = requests.get(url)
            if response.ok:
                soup = BeautifulSoup(response.text, "html.parser")
                titre = soup.find('h1')
                image = soup.find('img', {"id":"recipe-media-viewer-thumbnail-1"})['src']
                print(titre.text, image, url)
                outf.write(str(titre.text) + ',' + str(image) + ',' + str(url) + '\n')
                time.sleep(1)
"""
Could you please tell me why there is an error here:
<Response [200]>
Page: 0
Traceback (most recent call last):
  File "ex3.py", line 18, in <module>
    link = a['href']
TypeError: 'NoneType' object is not subscriptable
I've found the answer; I'm posting it here :) for anyone interested:
try:
    image = soup.find('img', {"id":"recipe-media-viewer-thumbnail-1"})['src']
except Exception as e:
    image = None
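The traceback itself points at link = a['href'] in the first loop, which fails the same way whenever div.find('a') returns None; the same kind of guard applies there (my adaptation, not part of the posted answer):

for div in divs:
    a = div.find('a')
    if a is not None and a.has_attr('href'):
        # Skip divs that contain no link instead of crashing on None.
        links.append('*******' + a['href'])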
I've been stuck trying to transfer the data that I scraped to a CSV file. Here is my code:
import requests, bs4, csv, sys
reload(sys)
sys.setdefaultencoding('utf-8')

url = 'http://www.constructeursdefrance.com/resultat/?dpt=01'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
links = []
for div in soup.select('.link'):
    link = div.a.get('href')
    links.append(link)

for i in links:
    url2 = i
    res2 = requests.get(url2)
    soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
    for each in soup2.select('li > strong'):
        data = each.text, each.next_sibling
    with open('french.csv', 'wb') as file:
        writer = csv.writer(file)
        writer.writerows(data)
the output says:
Traceback (most recent call last):
  File "test_new_project.py", line 23, in <module>
    writer.writerows(data)
csv.Error: sequence expected
But I am trying to put tuples into the CSV file, and as far as I know csv accepts tuples and lists. How can I fix this problem?
Atirag is correct, but you have another issue: the with call that opens the output file is nested within a for loop. So if there is more than one link, the file will be overwritten each time and the output will not be what you expect. I think this should generate the output you intend:
for div in soup.select('.link'):
    link = div.a.get('href')
    links.append(link)

with open("french.csv", "w") as file:
    writer = csv.writer(file)
    for i in links:
        res2 = requests.get(i)
        soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
        for each in soup2.select('li > strong'):
            writer.writerow([each.text, each.next_sibling])
Change this:

for each in soup2.select('li > strong'):
    data = each.text, each.next_sibling

to this:

data = []
for each in soup2.select('li > strong'):
    data.append((each.text, each.next_sibling))
Your data variable was a single tuple, not a list of tuples; the code above builds the list of tuples that writerows() expects.
Another solution is this (mind the indentation):
data = []
for i in links:
    url2 = i
    res2 = requests.get(url2)
    soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
    for each in soup2.select('li > strong'):
        data.append((each.text, each.next_sibling))

with open('french.csv', 'wb') as file:
    writer = csv.writer(file)
    writer.writerows(data)
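Note that the 'wb' mode here, like the reload(sys) / setdefaultencoding lines in the question, is a Python 2 idiom. On Python 3 the csv module wants a text-mode file opened with newline='' (my adaptation, reusing the data list built above):

with open('french.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(data)  # data is the list of (text, next_sibling) tuples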