Exception has occurred: AttributeError 'str' object has no attribute 'descendants' - python

I'm new at using python and I'm trying to make a web scraper for an internship
from typing import Container
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
p1 = ["https://www.libris.ro/search?iv.q={}", "https://carturesti.ro/product/search/{}", "https://www.elefant.ro/search?SearchTerm={}&StockAvailability=true", "https://www.litera.ro/catalogsearch/result/?q{}", "https://www.librariadelfin.ro/?submitted=1&O=search&keywords{}&do_submit=1", "https://bookzone.ro/cautare?term={}", "https://www.librex.ro/search/{}/?q={}"]
#price_min = 1000000
#url_min, price_min
title = "percy jackson"
for x in p1:
temp = x
title = title.replace(" ", "+")
url = temp.format(title)
if url == "https://www.libris.ro/search?iv.q=" + title :
**books = bs.find_all("div", class_="product-item-info imgdim-x")**
for each_book in books:
book_url = each_book.find("a")["href"]
price = each_book.find("span", class_="price-wrapper")
print(book_url)
print(price)
And I'm getting this error for the line between the two asterisks:
Exception has occurred: AttributeError
'str' object has no attribute 'descendants'

After from bs4 import BeautifulSoup as bs, bs is the class itself. You need to instantiate that class with data from the web site. In the code below, I've added a requests call to get the page and built the BeautifulSoup document from the response. You'll find some other errors in your code that need to be sorted out, but this will get you past this problem.
from typing import Container
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
p1 = ["https://www.libris.ro/search?iv.q={}", "https://carturesti.ro/product/search/{}", "https://www.elefant.ro/search?SearchTerm={}&StockAvailability=true", "https://www.litera.ro/catalogsearch/result/?q{}", "https://www.librariadelfin.ro/?submitted=1&O=search&keywords{}&do_submit=1", "https://bookzone.ro/cautare?term={}", "https://www.librex.ro/search/{}/?q={}"]
#price_min = 1000000
#url_min, price_min
title = "percy jackson"
# Build each store's search URL for the title; only the libris.ro URL is
# actually fetched and scraped below.
for x in p1:
    title = title.replace(" ", "+")
    # One template in p1 contains two "{}" placeholders, so pass the title
    # twice; str.format ignores the surplus argument for the one-slot templates.
    url = x.format(title, title)
    if url == "https://www.libris.ro/search?iv.q=" + title:
        # THE FIX
        resp = requests.get(url)
        # Every 2xx status is a success; the upper bound is exclusive, so it
        # has to be 300 (a bound of 299 would reject status 299).
        if not 200 <= resp.status_code < 300:
            print("failed", resp.status_code, url)
            continue
        # bs is the BeautifulSoup class: instantiate it with the page HTML
        # and search the resulting document, not the class.
        doc = bs(resp.text, "html.parser")
        books = doc.find_all("div", class_="product-item-info imgdim-x")
        for each_book in books:
            book_url = each_book.find("a")["href"]
            price = each_book.find("span", class_="price-wrapper")
            print(book_url)
            print(price)

Related

Web scraping from multiple pages with for loop part 2

My original problem:
"I have created web scraping tool for picking data from listed houses.
I have problem when it comes to changing page. I did make for loop to go from 1 to some number.
The problem is this: on this website the last "page" can be different all the time. Now it is 70, but tomorrow it can be 68 or 72. And if I put the range to, for example, (1-74), it will print the last page many times, because if you go over the maximum the site always loads the last page."
Then I got help from Ricco D who wrote code that it will know when to stop:
import requests
from bs4 import BeautifulSoup as bs
url='https://www.etuovi.com/myytavat-asunnot/oulu?haku=M1582971026&sivu=1000'
page=requests.get(url)
soup = bs(page.content,'html.parser')
last_page = None
pages = []
buttons=soup.find_all('button', class_= "Pagination__button__3H2wX")
for button in buttons:
pages.append(button.text)
print(pages)
This works just fine.
But when I try to combine this with my original code, which also works by itself, I run into an error:
Traceback (most recent call last):
File "C:/Users/Käyttäjä/PycharmProjects/Etuoviscaper/etuovi.py", line 29, in <module>
containers = page_soup.find("div", {"class": "ListPage__cardContainer__39dKQ"})
File "C:\Users\Käyttäjä\PycharmProjects\Etuoviscaper\venv\lib\site-packages\bs4\element.py", line 2173, in __getattr__
raise AttributeError(
AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
This is the error I get.
Any ideas how to get this work? Thanks
import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import re
import requests
my_url = 'https://www.etuovi.com/myytavat-asunnot/oulu?haku=M1582971026&sivu=1'
filename = "asunnot.csv"
f = open(filename, "w")
headers = "Neliöt; Hinta; Osoite; Kaupunginosa; Kaupunki; Huoneistoselitelmä; Rakennusvuosi\n"
f.write(headers)
page = requests.get(my_url)
soup = soup(page.content, 'html.parser')
pages = []
buttons = soup.findAll("button", {"class": "Pagination__button__3H2wX"})
for button in buttons:
pages.append(button.text)
last_page = int(pages[-1])
for sivu in range(1, last_page):
req = requests.get(my_url + str(sivu))
page_soup = soup(req.text, "html.parser")
containers = page_soup.findAll("div", {"class": "ListPage__cardContainer__39dKQ"})
for container in containers:
size_list = container.find("div", {"class": "flexboxgrid__col-xs__26GXk flexboxgrid__col-md-4__2DYW-"}).text
size_number = re.findall("\d+\,*\d+", size_list)
size = ''.join(size_number) # Asunnon koko neliöinä
prize_line = container.find("div", {"class": "flexboxgrid__col-xs-5__1-5sb flexboxgrid__col-md-4__2DYW-"}).text
prize_number_list = re.findall("\d+\d+", prize_line)
prize = ''.join(prize_number_list[:2]) # Asunnon hinta
address_city = container.h4.text
address_list = address_city.split(', ')[0:1]
address = ' '.join(address_list) # osoite
city_part = address_city.split(', ')[-2] # kaupunginosa
city = address_city.split(', ')[-1] # kaupunki
type_org = container.h5.text
type = type_org.replace("|", "").replace(",", "").replace(".", "") # asuntotyyppi
year_list = container.find("div", {"class": "flexboxgrid__col-xs-3__3Kf8r flexboxgrid__col-md-4__2DYW-"}).text
year_number = re.findall("\d+", year_list)
year = ' '.join(year_number)
print("pinta-ala: " + size)
print("hinta: " + prize)
print("osoite: " + address)
print("kaupunginosa: " + city_part)
print("kaupunki: " + city)
print("huoneistoselittelmä: " + type)
print("rakennusvuosi: " + year)
f.write(size + ";" + prize + ";" + address + ";" + city_part + ";" + city + ";" + type + ";" + year + "\n")
f.close()
Your main problem has to do with the way you use soup. You first import BeautifulSoup as soup - and then you override this name, when you create your first BeautifulSoup-instance:
soup = soup(page.content, 'html.parser')
From this point on, soup no longer names the BeautifulSoup class but the object you just created. Hence, when you try to create a new instance some lines further down (page_soup = soup(req.text, "html.parser")), this fails because soup no longer refers to BeautifulSoup.
So the best thing would be importing the library correctly like so: from bs4 import BeautifulSoup (or import AND use it as bs - like Ricco D did), and then change the two instantiating lines like so:
soup = BeautifulSoup(page.content, 'html.parser') # this is Python2.7-syntax btw
and
page_soup = BeautifulSoup(req.text, "html.parser") # this is Python3-syntax btw
If you're on Python3, the proper requests-syntax would be page.text and not page.content as .content returns bytes in Python3, which is not what you want (as BeautifulSoup needs a str). If you're on Python2.7 you should probably change req.text to req.content.
Good luck.
Finding your element by class name doesn't seem to be the best idea, because the same class name is used for all the following elements.
I don't know exactly what you are looking for because of the language. I suggest you go to the website, press F12, press Ctrl+F, type the XPath, and see what elements you get. If you don't know about XPaths, read this: https://blog.scrapinghub.com/2016/10/27/an-introduction-to-xpath-with-examples

AttributeError: 'function' object has no attribute 'read'

I'm getting an error in a program that fetches (searches) data from YouTube. It shows the error AttributeError: 'function' object has no attribute 'read'. I am on Python 3.
import urllib.request
from bs4 import BeautifulSoup
import sys
flag = 0
textToSearch = 'hello world'
query = sys.argv[0].strip("\"").replace(" ","+")
url = "https://www.youtube.com/results?search_query=" + query
response = urllib.request.urlopen
html = response.read()
soup = BeautifulSoup(html,"lxml")
for vid in soup.findAll(attrs={'class':'yt-uix-tile-link'}):
if ('https://www.youtube.com' + vid['href']).startswith("https://www.youtube.com/watch?v="):
flag = 1
print ('https://www.youtube.com' + vid['href'])
if flag == 0:
print ("No results found")
The mistake has been made here:
response = urllib.request.urlopen
html = response.read()
You put urllib.request.urlopen into response variable instead of the result of calling that function.
So instead of
response = urllib.request.urlopen
you should call it with appropriate parameters:
response = urllib.request.urlopen( .. parameters come here ... )
Have you tried using the requests library?
like this:
import requests
from bs4 import BeautifulSoup
import sys
flag = 0
textToSearch = 'hello world'
# NOTE(review): sys.argv[0] is the script's own path and textToSearch is never
# used below -- confirm which of the two was meant to be the search term.
query = sys.argv[0].strip("\"").replace(" ","+")
url = "https://www.youtube.com/results?search_query=" + query

# Fetch the results page and parse the raw response body.
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html,"lxml")

# Print every result link that points at a watch page, remembering whether
# at least one was printed.
for vid in soup.find_all(attrs={'class':'yt-uix-tile-link'}):
    link = 'https://www.youtube.com' + vid['href']
    if not link.startswith("https://www.youtube.com/watch?v="):
        continue
    flag = 1
    print (link)

if not flag:
    print ("No results found")

AttributeError: 'NoneType' object has no attribute 'tbody' - Spyder 3.3.1 / beautifulsoup4 / python 3.6

Hey this is my setup: Spyder 3.3.1 / beautifulsoup4 / python 3.6
The code below is from an article on Medium (here) about web scraping with Python and BeautifulSoup. It was supposed to be a quick read, but now, TWO days later, I still cannot get the code to run in Spyder and keep getting:
File "/Users/xxxxxxx/Documents/testdir/swiftScrape.py", line 9, in table_to_df
return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
AttributeError: 'NoneType' object has no attribute 'tbody'
I'm not sure what is going wrong; it seems to be an implementation error. Can anyone assist in shedding some light on this issue?
Thanks in advance.
import os
import bs4
import requests
import pandas as pd
PATH = os.path.join("C:\\","Users","xxxxx","Documents","tesdir")
def table_to_df(table):
return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
def next_page(soup):
return "http:" + soup.find('a', attrs={'rel':'next'}).get('href')
res = pd.DataFrame()
url = "http://bank-code.net/country/FRANCE-%28FR%29/"
counter = 0
while True:
print(counter)
page = requests.get(url)
soup = bs4.BeautifulSoup(page.content, 'lxml')
table = soup.find(name='table', attrs={'id':'tableID'})
res = res.append(table_to_df(table))
res.to_csv(os.path.join(PATH,"BIC","table.csv"), index=None, sep=';', encoding='iso-8859-1')
url = next_page(soup)
counter += 1
Like a lot of example code found on the web, this code is not production-grade code - it blindly assumes that HTTP requests always succeed and return the expected content. The truth is that it's quite often not the case (network errors, proxies or firewalls that block you, site down - temporarily or definitively, updates to the site that changed the urls and/or the page's markup, etc).
Your problem manifests itself here:
def table_to_df(table):
return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
and comes from table actually being None, which means that here in the while loop:
table = soup.find(name='table', attrs={'id':'tableID'})
there was no "table" tag with id "tableID" found in the html document. You can check this by printing the actual html content:
# Debugging variant of the scraping loop: when the expected table is missing,
# dump the HTML that was actually received so the cause can be inspected.
while True:
    print(counter)
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.content, 'lxml')
    table = soup.find(name='table', attrs={'id':'tableID'})
    if table is None:
        print("no table 'tableID' found for url {}".format(url))
        print("html content:\n{}\n".format( page.content))
        # Stop here: `url` is only advanced at the end of the loop body, so a
        # `continue` would request the same failing page forever.
        break
    # etc
Thanks @bruno desthuilliers for your pointers. Much appreciated.
This is the rewritten code that worked for me using Selenium and webdriver rather than import requests:
import os
import bs4
import pandas as pd
from selenium import webdriver
PATH = os.path.join('/','Users','benmorris','documents','testdir')
def table_to_df(table):
    """Return a DataFrame with one row per <tr> of the given BeautifulSoup table tag.

    Each row holds the text of that row's <td> cells.
    """
    # Read the rows from the table that was passed in rather than from the
    # module-level soup, so the function works on whatever tag it is given.
    return pd.DataFrame(
        [[td.text for td in row.find_all('td')] for row in table.find_all('tr')]
    )


def next_page(soup):
    """Return the URL of the rel="next" link, or None when the page has none."""
    link = soup.find('a', attrs={'rel': 'next'})
    if link is None:
        return None
    # The href is prefixed with the scheme, as in the original code.
    return "http:" + link.get('href')


res = pd.DataFrame()
frames = []  # one DataFrame per scraped page
url = "http://bank-code.net/country/FRANCE-%28FR%29/"
counter = 0
driver = webdriver.Chrome()
try:
    # next_page() returns None once there is no further page to visit.
    while url is not None:
        print(counter)
        driver.get(url)  # returns None; the rendered HTML is in driver.page_source
        soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
        # Look the table up in the parsed document: find() returns None when it
        # is missing, and the result is a tag that table_to_df() can walk.
        table = soup.find('table', attrs={'id': 'tableID'})
        if table is None:
            print("no table 'tableID' found for url {}".format(url))
            print("html content:\n{}\n".format(driver.page_source))
            # Retrying the same unchanged url would loop forever, so stop.
            break
        frames.append(table_to_df(table))
        # DataFrame.append was removed in pandas 2.0; concat replaces it.
        res = pd.concat(frames, ignore_index=True)
        # Rewrite the CSV after every page so partial results survive a crash.
        res.to_csv(os.path.join(PATH,"BIC","table.csv"), index=False, sep=',', encoding='iso-8859-1')
        url = next_page(soup)
        counter += 1
finally:
    # Always close the browser, even when scraping fails part-way.
    driver.quit()

Beautifulsoup python3 Howlongtobeat.com extracting name (and other elements)

I'm trying to figure out how to extract the name of the game with BeautifulSoup.
I think I'm having a problem with the HTML aspect of it.
here what I have so far:
from requests import get
url = 'https://howlongtobeat.com/game.php?id=38050'
response = get(url)
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')
game_length = html_soup.find_all('div', class_='game_times')
length = (game_length[-1].find_all({'li': ' short time_100 shadow_box'})[-1].contents[3].get_text())
print(length)
game_name = html_soup.find_all('div', class_='profile_header_game')
game = (game_name[].find({"profile_header shadow_text"})[].contents[].get_text())
print(game)
I'm getting the length but not the game name. Why?
for print(length) prints:
31 Hours
but for print(game) prints:
game_name = html_soup.find_all('div', class_='profile_header_game')
game = (game_name[].find({"profile_header shadow_text"})[].contents[].get_text())
File "", line 1
game = (game_name[].find({"profile_header shadow_text"})[].contents[].get_text())
^
SyntaxError: invalid syntax
print(game)
Traceback (most recent call last):
File "", line 1, in
NameError: name 'game' is not defined
what am I doing wrong?
It looks like there are a few syntax issues in your code. Here is a corrected version:
from bs4 import BeautifulSoup
import requests
url = 'https://howlongtobeat.com/game.php?id=38050'
response = requests.get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')

# Pair the stripped <h5> text with the stripped <div> text of every <li>
# inside the first div with class "game_times".
game_times_tag = html_soup.find('div', class_='game_times')
game_time_list = [
    (entry.find('h5').text.strip(), entry.find('div').text.strip())
    for entry in game_times_tag.find_all('li')
]
for pair in game_time_list:
    print(pair)

# The game name is the stripped text of the profile header div.
profile_header_tag = html_soup.find("div", {"class": "profile_header shadow_text"})
game_name = profile_header_tag.text.strip()
print(game_name)
shorter version
game_length = html_soup.select('div.game_times li div')[-1].text
game_name = html_soup.select('div.profile_header')[0].text
developer = html_soup.find_all('strong', string='\nDeveloper:\n')[0].next_sibling

Beautiful Soup error: NameError: name 'htmltext' is not defined

I'm getting this error:
NameError: name 'htmltext' is not defined
It comes from the code below:
from bs4 import BeautifulSoup
import urllib
import urllib.parse
url = "http://nytimes.com"
urls = [url]
visited = [url]
while len(urls) > 0:
try:
htmltext = urllib.urlopen(urls[0]).read()
except:
print(urls[0])
soup = BeautifulSoup(htmltext)
urls.pop(0)
print(soup.findAll('a',href = true))
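In Python 3.x, you have to import urllib.request instead of urllib. Because urllib.urlopen does not exist there, the call inside try raises, the bare except swallows the error, and htmltext is never assigned - hence the NameError on the next line. Then, change the line: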
htmltext = urllib.urlopen(urls[0]).read()
to:
htmltext = urllib.request.urlopen(urls[0]).read()
Finally, change true to True.

Categories

Resources