Beautiful Soup error: NameError: name 'htmltext' is not defined - python

I'm getting this error:
NameError: name 'htmltext' is not defined
It comes from the code below:
from bs4 import BeautifulSoup
import urllib
import urllib.parse
url = "http://nytimes.com"
urls = [url]
visited = [url]
while len(urls) > 0:
try:
htmltext = urllib.urlopen(urls[0]).read()
except:
print(urls[0])
soup = BeautifulSoup(htmltext)
urls.pop(0)
print(soup.findAll('a',href = true))

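The NameError is only a symptom: in Python 3.x, urllib.urlopen does not exist, so the call inside the try block raises, the bare except swallows that error, and htmltext is never assigned. In Python 3.x, you have to import urllib.request instead of urllib. Then, change the line: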
htmltext = urllib.urlopen(urls[0]).read()
to:
htmltext = urllib.request.urlopen(urls[0]).read()
Finally, change true to True.

Related

Exception has occurred: AttributeError 'str' object has no attribute 'descendants'

I'm new to Python and I'm trying to make a web scraper for an internship:
from typing import Container
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
p1 = ["https://www.libris.ro/search?iv.q={}", "https://carturesti.ro/product/search/{}", "https://www.elefant.ro/search?SearchTerm={}&StockAvailability=true", "https://www.litera.ro/catalogsearch/result/?q{}", "https://www.librariadelfin.ro/?submitted=1&O=search&keywords{}&do_submit=1", "https://bookzone.ro/cautare?term={}", "https://www.librex.ro/search/{}/?q={}"]
#price_min = 1000000
#url_min, price_min
title = "percy jackson"
for x in p1:
temp = x
title = title.replace(" ", "+")
url = temp.format(title)
if url == "https://www.libris.ro/search?iv.q=" + title :
**books = bs.find_all("div", class_="product-item-info imgdim-x")**
for each_book in books:
book_url = each_book.find("a")["href"]
price = each_book.find("span", class_="price-wrapper")
print(book_url)
print(price)
and I'm getting this error for the line between the two pairs of asterisks:
Exception has occurred: AttributeError
'str' object has no attribute 'descendants'
After from bs4 import BeautifulSoup as bs, bs is the class itself, so bs.find_all(...) is called on the class rather than on a parsed document. You need to instantiate that class with data from the web site. In the code below, I've added a requests call to get the page and have built the BeautifulSoup document from there. You'll find some other errors in your code that need to be sorted out, but this will get you past this problem.
from typing import Container
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
# Search-URL templates, one per bookshop; each placeholder receives the query.
# NOTE(review): "?q{}" and "keywords{}" look like they are missing an "=" —
# confirm against the sites before relying on those two entries.
p1 = [
    "https://www.libris.ro/search?iv.q={}",
    "https://carturesti.ro/product/search/{}",
    "https://www.elefant.ro/search?SearchTerm={}&StockAvailability=true",
    "https://www.litera.ro/catalogsearch/result/?q{}",
    "https://www.librariadelfin.ro/?submitted=1&O=search&keywords{}&do_submit=1",
    "https://bookzone.ro/cautare?term={}",
    # This template needs the query twice. Explicit {0} indices let the single
    # format() argument fill both slots; two bare "{}" would raise IndexError.
    "https://www.librex.ro/search/{0}/?q={0}",
]
#price_min = 1000000
#url_min, price_min
# Encode the query once, before the loop: spaces become "+".
title = "percy jackson".replace(" ", "+")
for x in p1:
    url = x.format(title)
    # Only the libris.ro result page is parsed here.
    if url == "https://www.libris.ro/search?iv.q=" + title:
        # THE FIX: download the page and build a BeautifulSoup document from
        # it, instead of calling find_all on the BeautifulSoup class itself.
        resp = requests.get(url)
        # Every 2xx status is a success, so the exclusive upper bound is 300.
        if not 200 <= resp.status_code < 300:
            print("failed", resp.status_code, url)
            continue
        doc = bs(resp.text, "html.parser")
        books = doc.find_all("div", class_="product-item-info imgdim-x")
        for each_book in books:
            book_url = each_book.find("a")["href"]
            price = each_book.find("span", class_="price-wrapper")
            print(book_url)
            print(price)

if string contains from list

I want to check if any of the excluded sites show up. I can get it to work with just one site, but as soon as I make it a list, it errors at if donts in thingy:
TypeError: 'in <string>' requires string as left operand, not tuple
This is my code:
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re
url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.select('a[href*="http"]'):
thingy = (link.get('href'))
thingy = str(thingy)
if donts in thingy:
pass
else:
print (thingy)
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re
url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.select('a[href*="http"]'):
thingy = (link.get('href'))
thingy = str(thingy)
if thingy in donts :
print (thingy)
else:
pass
Judge: string in tuple
The crux of your problem is how you're searching your excluded list:
# Minimal example: print every link that is not one of the excluded entries.
excluded = ("a", "b", "c")
links = ["a", "d", "e"]
for site in links:
    # We want to know if the site is in the excluded list, so the site is the
    # left operand of the membership test and the tuple is the right one.
    if site not in excluded:
        print(f"Site not excluded: {site}")
Reverse the order of your elements and this should work fine. I've inverted your logic here so you can skip the unnecessary pass.
As a side note, this is one reason clear variable names help: they make it easier to reason about what the logic should be doing. This is especially useful in Python, where ergonomic operators like `in` exist.
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re
url = "http://stackoverflow.com"
# Domains whose links should be left out of the output.
donts = ('stackoverflow.com', 'stackexchange.com')
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')
# The selector keeps only anchors whose href contains "http".
for link in soup.select('a[href*="http"]'):
    thingy = str(link.get('href'))
    # Substring test against each excluded domain: print the link only when
    # none of them occurs anywhere in it.
    if not any(d in thingy for d in donts):
        print(thingy)

AttributeError: 'function' object has no attribute 'read'

I'm getting an error in a program that searches YouTube and fetches the results. It fails with AttributeError: 'function' object has no attribute 'read'. I am on Python 3.
import urllib.request
from bs4 import BeautifulSoup
import sys
flag = 0
textToSearch = 'hello world'
query = sys.argv[0].strip("\"").replace(" ","+")
url = "https://www.youtube.com/results?search_query=" + query
response = urllib.request.urlopen
html = response.read()
soup = BeautifulSoup(html,"lxml")
for vid in soup.findAll(attrs={'class':'yt-uix-tile-link'}):
if ('https://www.youtube.com' + vid['href']).startswith("https://www.youtube.com/watch?v="):
flag = 1
print ('https://www.youtube.com' + vid['href'])
if flag == 0:
print ("No results found")
The mistake has been made here:
response = urllib.request.urlopen
html = response.read()
You assigned the function urllib.request.urlopen itself to the response variable, instead of the result of calling that function.
So instead of
response = urllib.request.urlopen
you should call it with appropriate parameters:
response = urllib.request.urlopen( .. parameters come here ... )
Have you tried using the requests library?
For example:
import requests
from bs4 import BeautifulSoup
import sys
# Set to 1 as soon as at least one video link has been printed.
flag = 0
textToSearch = 'hello world'
# sys.argv[0] is the script's own path, not user input: read the search text
# from the first real command-line argument and fall back to the default.
rawQuery = sys.argv[1] if len(sys.argv) > 1 else textToSearch
query = rawQuery.strip("\"").replace(" ", "+")
url = "https://www.youtube.com/results?search_query=" + query
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, "lxml")
for vid in soup.findAll(attrs={'class': 'yt-uix-tile-link'}):
    link = 'https://www.youtube.com' + vid['href']
    # Keep only links that point at a video watch page.
    if link.startswith("https://www.youtube.com/watch?v="):
        flag = 1
        print(link)
if flag == 0:
    print("No results found")

Fix the syntax error of a list comprehension that contains beautiful soup methods

I tried hard but there is always some syntax error with the piece of code that follows.
import urllib.request
import re
import csv
from bs4 import BeautifulSoup
from bs4 import NavigableString
from unicodedata import normalize
url = input('Please paste the link here: ')
html = urllib.request.urlretrieve(url)
html_file = open(html[0])
soup = BeautifulSoup(html_file, 'html5lib')
def contains_href(tag):
    """Return the result of looking up an ``<a>`` with an ``href`` inside *tag*.

    The value of ``tag.find('a', href=True)`` is returned unchanged, so the
    function can be used directly as a truthiness filter.
    """
    return tag.find('a', href=True)
scrollables = [table in soup.find_all('table', class_='sc_courselist') if contains_href(table)]
def num_name_unit(tag):
    """Return a one-element list with the formatted summary of a course row.

    Three ``td`` lookups are made on *tag*: one with ``href=True`` (course
    number), one with ``class_=False`` (course name) and one with
    ``class_='hourscol'`` (course unit). Each cell's ``.string`` is stripped
    and NFKD-normalized before being placed in the summary.

    No handling is done for a lookup that finds nothing or for a cell whose
    ``.string`` is not a string; such a row raises ``AttributeError``.
    """
    def cell_text(cell):
        # Strip surrounding whitespace, then apply NFKD normalization.
        return normalize('NFKD', cell.string.strip())

    num = cell_text(tag.find('td', href=True))
    name = cell_text(tag.find('td', class_=False))
    unit = cell_text(tag.find('td', class_='hourscol'))
    return ['Course Number: {0} | Course Name: {1} | Course Unit: {2}'.format(num, name, unit)]
dic_rows = {scrollable.find_previous_siblings(re.compile('h'), class_=False, limit=1).string.strip(): list(num_name_unit(tr) for tr in scrollable.find_all('tr', contains_href)) for scrollable in scrollables}
I expect that the terminal would print the following request: "Please paste the link here: ". In reality, it says "invalid syntax" at the end of scrollables = [table in soup.find_all('table', class_='sc_courselist') if contains_href(table)].
enter image description here
You are missing the "table for" part at the start of your list comprehension. It should be:
[table for table in soup.find_all('table', class_='sc_courselist') if contains_href(table)]

Beautifulsoup python3 Howlongtobeat.com extracting name (and other elements)

I'm trying to figure out how to extract the name of the game with BeautifulSoup.
I think I'm having a problem with the HTML aspect of it.
Here is what I have so far:
from requests import get
url = 'https://howlongtobeat.com/game.php?id=38050'
response = get(url)
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')
game_length = html_soup.find_all('div', class_='game_times')
length = (game_length[-1].find_all({'li': ' short time_100 shadow_box'})[-1].contents[3].get_text())
print(length)
game_name = html_soup.find_all('div', class_='profile_header_game')
game = (game_name[].find({"profile_header shadow_text"})[].contents[].get_text())
print(game)
I'm getting the length but not the game name. Why?
print(length) prints:
31 Hours
but the game lines fail with:
game_name = html_soup.find_all('div', class_='profile_header_game')
game = (game_name[].find({"profile_header shadow_text"})[].contents[].get_text())
File "", line 1
game = (game_name[].find({"profile_header shadow_text"})[].contents[].get_text())
^
SyntaxError: invalid syntax
print(game)
Traceback (most recent call last):
File "", line 1, in
NameError: name 'game' is not defined
what am I doing wrong?
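It looks like there are a few syntax issues in your code: an empty subscript such as game_name[] is not valid Python, because every [] needs an index inside it. Here is a corrected version: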
from bs4 import BeautifulSoup
import requests
url = 'https://howlongtobeat.com/game.php?id=38050'
response = requests.get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')

# Build one (title, play time) pair per <li> inside the first "game_times"
# div: the title is the <h5> text, the play time is the <div> text.
game_times_tag = html_soup.find('div', class_='game_times')
game_time_list = [
    (li_tag.find('h5').text.strip(), li_tag.find('div').text.strip())
    for li_tag in game_times_tag.find_all('li')
]
for game_time in game_time_list:
    print(game_time)

# The game name is the stripped text of the profile header div.
profile_header_tag = html_soup.find("div", {"class": "profile_header shadow_text"})
game_name = profile_header_tag.text.strip()
print(game_name)
A shorter version:
# Text of the last element matching the CSS selector 'div.game_times li div'.
game_length = html_soup.select('div.game_times li div')[-1].text
# Text of the first element matching 'div.profile_header'.
game_name = html_soup.select('div.profile_header')[0].text
# Sibling that follows the first <strong> whose string is '\nDeveloper:\n'.
developer = html_soup.find_all('strong', string='\nDeveloper:\n')[0].next_sibling

Categories

Resources