How can I handle a MemoryError on an HTML file? - Python

I have this kind of code:
URL = r"C:\Users\jarze\PycharmProjects\CSV-modifier\venv\html.html"
html_report_part1 = open(URL, 'r', encoding="UTF-8").read()
soup = BeautifulSoup(html_report_part1, "html.parser")
and it returns this kind of error:
During handling of the above exception, another exception occurred:
MemoryError

Try:
URL = r"C:\Users\jarze\PycharmProjects\CSV-modifier\venv\html.html"
html_text = ''
with open(URL, 'r', encoding="UTF-8") as html_report_part1:
    for line in html_report_part1:  # iterate lazily; readlines() would load every line into memory at once
        html_text += line
soup = BeautifulSoup(html_text, "html.parser")
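That still accumulates the entire document in html_text before parsing, so a very large file can fail the same way. If only part of the page is needed, a SoupStrainer limits how much of the tree BeautifulSoup builds; a minimal sketch, assuming (hypothetically) that only the <table> elements matter:

from bs4 import BeautifulSoup, SoupStrainer

URL = r"C:\Users\jarze\PycharmProjects\CSV-modifier\venv\html.html"
with open(URL, 'r', encoding="UTF-8") as f:
    # parse_only discards everything outside the matched tags while parsing,
    # so the resulting tree (and its memory footprint) stays much smaller
    soup = BeautifulSoup(f, "html.parser", parse_only=SoupStrainer("table"))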


How do I store the output of this function somewhere?

I'm trying to scrape a URL for elements and write them to a file, but it doesn't seem to work. The function runs fine when it just prints them:
import requests
from bs4 import BeautifulSoup

# url and HEADERS are defined elsewhere in my script
def get_price():
    page = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(page.content, 'lxml')
    while True:
        try:
            product_title = soup.find("span", {'id': "productTitle"}).text.strip()
            #price = float((soup.find("span", {'class': "a-offscreen"}).text.split("₹")[1]).replace(',', ""))
            f = open("Textfile.txt", "w")
            f.write(product_title)
            f.close()
            break
        except AttributeError:
            get_price()
            break

get_price()
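One way to make this more robust: retry a bounded number of times instead of recursing in the except branch, and use a with block so the file is always flushed and closed. A minimal sketch under the same assumptions (url and HEADERS defined elsewhere; max_retries is a hypothetical parameter):

import requests
from bs4 import BeautifulSoup

def get_price(max_retries=3):
    # url and HEADERS are assumed to be defined elsewhere, as in the question
    for _ in range(max_retries):
        page = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(page.content, 'lxml')
        title_tag = soup.find("span", {'id': "productTitle"})
        if title_tag is not None:  # find() returns None when the tag is missing
            with open("Textfile.txt", "w") as f:
                f.write(title_tag.text.strip())
            return

get_price()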

'HTTPResponse' object is not callable

I'm trying to open a file using Python to store the article text extracted from a URL.
import urllib.request
from bs4 import BeautifulSoup
url = "http://www.thedrum.com/news/2015/07/29/mankind-must-get-ahead-technical-development-states-phds-mark-holden-following"
openfile = urlopen(url)
soup = BeautifulSoup(openfile)
f = open("test1.txt", "w")
for data in soup.find_all("p"):
    sum1 = data.get_text()
    f.writelines(sum1)
f.close()
But it shows TypeError: 'HTTPResponse' object is not callable, pointing to f = open("test1.txt", "w").
How do I solve this?
You need to make the soup object from the HTML content (which you get from the read() call), not from the response object:
with urllib.request.urlopen(url) as r, open("test1.txt", "w") as f:
    soup = BeautifulSoup(r.read().decode('utf-8'), "html.parser")
    for data in soup.find_all("p"):
        sum1 = data.get_text()
        f.writelines(sum1)
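As an aside, BeautifulSoup also accepts an open file-like object, and the HTTP response qualifies, so the explicit read()/decode() can be skipped; a minimal variant of the same fix:

import urllib.request
from bs4 import BeautifulSoup

with urllib.request.urlopen(url) as r, open("test1.txt", "w") as f:
    soup = BeautifulSoup(r, "html.parser")  # r is file-like, so this works too
    for data in soup.find_all("p"):
        f.write(data.get_text())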

Strange Error in Python using BeautifulSoup Prettify method

I've got the following problem. I wrote a simple "TextBasedBrowser" (if you can even call it a browser at this point :D). The website scraping and parsing with BS4 works fine so far, but the output is formatted terribly and is pretty much unreadable. As soon as I try to use the prettify() method from BS4, it throws an AttributeError. I searched for quite a while on Google but couldn't find anything. This is my code (the prettify() call is commented out):
from bs4 import BeautifulSoup
import requests
import sys
import os

legal_html_tags = ['p', 'a', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']
saved_pages = []

def search_url(url):
    saved_pages.append(url.rstrip(".com"))
    url = requests.get(f'https://{url}')
    return url.text

def parse_html(html_page):
    final_text = ""
    soup = BeautifulSoup(html_page, 'html.parser')
    # soup = soup.prettify()
    plain_text = soup.find_all(text=True)
    for t in plain_text:
        if t.parent.name in legal_html_tags:
            final_text += '{} '.format(t)
    return final_text

def save_webpage(url, tb_dir):
    with open(f'{tb_dir}/{url.rstrip(".com")}.txt', 'w', encoding="utf-8") as tab:
        tab.write(parse_html(search_url(url)))

def check_url(url):
    if url.endswith(".com") or url.endswith(".org") or url.endswith(".net"):
        return True
    else:
        return False

args = sys.argv
directory = args[1]
try:
    os.mkdir(directory)
except FileExistsError:
    print("Error: File already exists")

while True:
    url_ = input()
    if url_ == "exit":
        break
    elif url_ in saved_pages:
        with open(f'{directory}/{url_}.txt', 'r', encoding="utf-8") as curr_page:
            print(curr_page.read())
    elif not check_url(url_):
        print("Error: Invalid URL")
    else:
        save_webpage(url_, directory)
        print(parse_html(search_url(url_)))
And this is the error:
Traceback (most recent call last):
  File "browser.py", line 56, in <module>
    save_webpage(url_, directory)
  File "browser.py", line 29, in save_webpage
    tab.write(parse_html(search_url(url)))
  File "browser.py", line 20, in parse_html
    plain_text = soup.find_all(text=True)
AttributeError: 'str' object has no attribute 'find_all'
If I include the encoding parameter in the prettify() call, it throws 'bytes' instead of 'str' object.
You have re-assigned the soup variable to a string by using the .prettify() method:
soup = soup.prettify()
find_all() is a method of soup objects only.
You should call find_all(text=True) first to extract all the text nodes, and then perform your string operations.
prettify() turns your parsed HTML object into a string, so you can't call find_all on it. Maybe you just want to return soup.prettify()?
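A quick way to see the type change that causes the error (a minimal sketch):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>hello</p>", "html.parser")
print(type(soup))             # <class 'bs4.BeautifulSoup'>
print(type(soup.prettify()))  # <class 'str'> -- a plain string has no find_all()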
This might be what you want:
def parse_html(html_page):
    final_text = ""
    soup = BeautifulSoup(html_page, 'html.parser')
    # call find_all(text=True) first; each result is a NavigableString,
    # which is already a plain string (it has no prettify() method itself)
    plain_text = soup.find_all(text=True)
    for t in plain_text:
        if t.parent.name in legal_html_tags:
            final_text += '{} '.format(t)
    return final_text

Python scraping bs4 TypeError: 'NoneType' object is not subscriptable

I hope you're well. Could you please tell me why my scraping script doesn't work properly? :)
It works with other websites. I'm a beginner, so I probably made a basic mistake.
import requests
from bs4 import BeautifulSoup
import time
import csv

links = []
for i in range(1):
    url = '*******/recettes/?page={}'.format(i)  # format the page number into the URL
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    print(response)
    if response.ok:
        print('Page: ' + str(i))
        soup = BeautifulSoup(response.text, "html.parser")
        divs = soup.findAll('div', class_='field-item even')
        for div in divs:
            a = div.find('a')
            link = a['href']
            links.append('*******' + link)
    time.sleep(3)

print(len(links))
with open('urls3.txt', 'w') as file:
    for link in links:
        file.write(link + '\n')
"""
with open('urls3.txt', 'r') as inf:
    with open('recipes3.csv', 'w') as outf:
        outf.write('titre,image,url,\n')
        for row in inf:
            url = row.strip()
            response = requests.get(url)
            if response.ok:
                soup = BeautifulSoup(response.text, "html.parser")
                titre = soup.find('h1')
                image = soup.find('img', {"id":"recipe-media-viewer-thumbnail-1"})['src']
                print(titre.text, image, url)
                outf.write(str(titre.text) + ',' + str(image) + ',' + str(url) + '\n')
                time.sleep(1)
"""
Could you please tell me why there is a mistake here:
<Response [200]>
Page: 0
Traceback (most recent call last):
  File "ex3.py", line 18, in <module>
    link = a['href']
TypeError: 'NoneType' object is not subscriptable
I've found the answer; I'm posting it here :) for anyone interested:
try:
    image = soup.find('img', {"id":"recipe-media-viewer-thumbnail-1"})['src']
except TypeError:  # find() returned None, which can't be subscripted
    image = None
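Note that the traceback above actually points at link = a['href'], which fails the same way whenever div.find('a') returns None. A minimal guard for that spot, reusing the question's divs loop:

for div in divs:
    a = div.find('a')
    if a is not None:  # find() returns None when the div contains no <a> tag
        links.append('*******' + a['href'])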

Writing scraped data to csv

I've been stuck trying to transfer the data that I scraped to a CSV file. Here is my code:
import requests, bs4, csv, sys
reload(sys)
sys.setdefaultencoding('utf-8')

url = 'http://www.constructeursdefrance.com/resultat/?dpt=01'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
links = []
for div in soup.select('.link'):
    link = div.a.get('href')
    links.append(link)
for i in links:
    url2 = i
    res2 = requests.get(url2)
    soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
    for each in soup2.select('li > strong'):
        data = each.text, each.next_sibling
    with open('french.csv', 'wb') as file:
        writer = csv.writer(file)
        writer.writerows(data)
the output says:
Traceback (most recent call last):
  File "test_new_project.py", line 23, in <module>
    writer.writerows(data)
csv.Error: sequence expected
But I am trying to put tuples into the CSV file, and as far as I know csv accepts tuples and lists. How can I fix this problem?
Atirag is correct, but you have another issue: the with call that opens the output file is nested inside a for loop, so if there is more than one link, the file is overwritten on every iteration and the output will not be what you expect. I think this should generate the output you intend:
for div in soup.select('.link'):
    link = div.a.get('href')
    links.append(link)

with open("french.csv", "w") as file:
    writer = csv.writer(file)
    for i in links:
        res2 = requests.get(i)
        soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
        for each in soup2.select('li > strong'):
            writer.writerow([each.text, each.next_sibling])
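One caveat with this version, assuming Python 3 is in use: the csv module recommends opening the file with newline='' so the writer controls row endings itself; otherwise blank lines can appear between rows on Windows:

import csv

# newline="" lets the csv module manage row terminators itself
with open("french.csv", "w", newline="") as file:
    writer = csv.writer(file)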
Change this:
for each in soup2.select('li > strong'):
    data = each.text, each.next_sibling
to this:
data = []
for each in soup2.select('li > strong'):
    data.append((each.text, each.next_sibling))
Your data variable was a single tuple, not a list of tuples; the code above builds the list of tuples that writerows() expects.
Another solution is this (mind the indentation):
data = []
for i in links:
    url2 = i
    res2 = requests.get(url2)
    soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
    for each in soup2.select('li > strong'):
        data.append((each.text, each.next_sibling))

with open('french.csv', 'wb') as file:
    writer = csv.writer(file)
    writer.writerows(data)
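To make the writerow()/writerows() distinction concrete, a minimal self-contained sketch (Python 3, made-up data):

import csv
import io

buf = io.StringIO()
writer = csv.writer(buf)

row = ("titre", "valeur")     # one row, as a tuple
writer.writerow(row)          # writerow() takes a single row
writer.writerows([row, row])  # writerows() takes a sequence of rows
print(buf.getvalue())
# Passing a single tuple of strings straight to writerows() would instead
# treat each string as its own row of one-character fields.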
