how to get products names from amazon - python

I saw all the relevant previous topics about that manner and i've learned a lot (especially about the difference between lxml and html.parser)
anyway after i changed my BeautifulSoup to parse the page as lxml I still can't get all the time the same result .
most of the time i get : " name = soup.find('span', id="productTitle").text
AttributeError: 'NoneType' object has no attribute 'text' "
but in a few times i get the real name of the product.
what am I missing?
import requests
from bs4 import BeautifulSoup
def main():
url = "https://www.amazon.com/Homego-Bluetooth-Wrist-Smart-Handsfree/dp/B01DOULDN0/ref=sr_1_1?keywords=smart+watch&qid=1569450390&sr=8-1"
client = requests.get(url, headers={"User-Agent": "Defined"})
try:
client.raise_for_status()
except requests.exceptions.HTTPError as e:
print("Error!!!!" + str(e))
soup = BeautifulSoup(client.content, 'lxml')
name = soup.find('span', id="productTitle").text
title = name[161:len(name)-128]
print("the title is: ", title)
if __name__ == '__main__':
main()

Presumably you are getting different html back. Check the html itself. You can add in a test of whether variables are None before attempting to access .text
import requests
from bs4 import BeautifulSoup
def main():
url = "https://www.amazon.com/Homego-Bluetooth-Wrist-Smart-Handsfree/dp/B01DOULDN0/ref=sr_1_1?keywords=smart+watch&qid=1569450390&sr=8-1"
client = requests.get(url, headers={"User-Agent": "Defined"})
try:
client.raise_for_status()
except requests.exceptions.HTTPError as e:
print("Error!!!!" + str(e))
soup = BeautifulSoup(client.content, 'lxml')
name = soup.find('span', id="productTitle")
if name is None:
name = 'N/A'
title = 'n/a'
else:
name = name.text
title = name[161:len(name)-128]
print("the title is: ", title)
if __name__ == '__main__':
main()

Related

Web scraping with BS4

I have a problem with scraping some basic info about movies from imdb.com. I want my program to get title and description of a movie from a given URL. The title part is doing its job, however I can't figure out how to get the description. Here's my code:
import requests
from bs4 import BeautifulSoup as bs
def get_data(url):
r = requests.get(url, headers={'Accept-Language': 'en-US,en;q=0.5'})
if not r or 'https://www.imdb.com/title' not in url:
return print('Invalid movie page!')
return r.content
if __name__ == '__main__':
# print('Input the URL:')
# link = input()
link = 'https://www.imdb.com/title/tt0111161'
data = get_data(link)
soup = bs(data, 'html.parser')
title = ' '.join(soup.find('h1').text.split()[:-1])
desc = soup.find('p', {'data-testid':"plot", 'class':"GenresAndPlot__Plot-cum89p-8 kmrpno"}).text
movie_info = {'title': title, 'description': desc}
print(movie_info)
When I run it I get an error:
Exception has occurred: AttributeError
'NoneType' object has no attribute 'text'
File "movie-scraper.py", line 18, in <module>
desc = soup.find('p', {'data-testid':"plot", 'class':"GenresAndPlot__Plot-cum89p-8 kmrpno"}).text
How do I access the description properly?
To get plot summary, change the selector to find class="plot_summary":
import requests
from bs4 import BeautifulSoup as bs
def get_data(url):
r = requests.get(url, headers={"Accept-Language": "en-US,en;q=0.5"})
if not r or "https://www.imdb.com/title" not in url:
return print("Invalid movie page!")
return r.content
if __name__ == "__main__":
link = "https://www.imdb.com/title/tt0111161"
data = get_data(link)
soup = bs(data, "html.parser")
title = " ".join(soup.find("h1").text.split()[:-1])
desc = soup.find("div", class_="plot_summary").get_text(strip=True) # <-- change this to find class="plot_summary"
movie_info = {"title": title, "description": desc}
print(movie_info)
Prints:
{'title': 'The Shawshank Redemption', 'description': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.Director:Frank DarabontWriters:Stephen King(short story "Rita Hayworth and Shawshank Redemption"),Frank Darabont(screenplay)Stars:Tim Robbins,Morgan Freeman,Bob Gunton|See full cast & crew»'}

How to get all listings urls from main page with python web scraping

I wrote a code for web scraping, My code is ok just except two issues. From detail page, everything is ok just ISBN NO, and from main page, I need all listing URLs so that my code could scrape date from aa listings. Please guide me how can I fix this issue. Both(main page and details page )URLs are in the code. Thank you!
here is my code:
import requests
from bs4 import BeautifulSoup
import csv
def get_page(url):
response = requests.get(url)
if not response.ok:
print('server responded:', response.status_code)
else:
soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
return soup
def get_detail_data(soup):
try:
title = soup.find('span',class_="title product-field",id=False).text
except:
title = 'empty'
print(title)
try:
writer = soup.find('a',class_="contributor-name",id=False).text
except:
writer = 'empty'
print(writer)
try:
original_price = soup.find('div',class_="original-price",id=False).find('span').text
except:
original_price = 'empty'
print(original_price)
try:
active_price = soup.find('div',class_="active-price",id=False).find('span').text
except:
active_price = 'empty'
print(active_price)
try:
img = soup.find('div',class_="image-actions image-container product-type-icon-container book",id=False).find('img').attrs['src']
except:
img = 'empty'
print(img)
try:
isbn = soup.find('div',class_="bookitem-secondary-metadata",id=False).find('li').attrs['ISBN: ']
except:
isbn = 'empty'
print(isbn)
data = {
'title' : title,
'writer' : writer,
'original_price' : original_price,
'active_price' : active_price,
'image' : img,
'isbn' : isbn
}
return data
def get_index_data(soup):
titles_link = soup.find_all('a',class_="body_link_11")
try:
inks = soup.find('div', class_="item-info",id=False).find('p').find('a').get('href')
except:
inks = "empty"
print(inks)
def main():
#detail_page_url = "https://www.kobo.com/ww/en/ebook/mum-dad-1"
mainurl = "https://www.kobo.com/ww/en/list/new-hot-in-fiction/youL53408U25RHrVu3wR5Q"
#get_page(url)
#get_detail_data(get_page(detail_page_url))
get_index_data(get_page(mainurl))
if __name__ == '__main__':
main()
import requests
import re
import json
from bs4 import BeautifulSoup
import csv
def Soup(content):
soup = BeautifulSoup(content, 'html.parser')
return soup
def Main(url):
r = requests.get(url)
soup = Soup(r.content)
scripts = soup.findAll("script", type="application/ld+json",
text=re.compile("data"))
prices = [span.text for span in soup.select(
"p.product-field.price span span") if span.text != "USD"]
with open("data.csv", 'w', newline="") as f:
writer = csv.writer(f)
writer.writerow(["Title", "Writer", "Price", "ISBN", "IMG", "URL"])
for script, price in zip(scripts, prices):
script = json.loads(script.text)
title = script["data"]["name"]
author = script["data"]["author"][0]["name"]
img = f'https:{script["data"]["thumbnailUrl"]}'
isbn = script["data"]["isbn"]
url = script["data"]["url"]
writer.writerow([title, author, price, isbn, img, url])
Main("https://www.kobo.com/ww/en/list/new-hot-in-fiction/youL53408U25RHrVu3wR5Q")
Output: View-Online
Output Sample:

I have issue with data scraping with python

I wrote a code for data scraping from airbnb.com actually I want to scrape all comments with details like listing name, total revies, revies, commenter name, date, comments but my code does not execute the try part it directly goes to except part. please guide me on how can I fix this issue. thank you!
here is my code:
import requests
from bs4 import BeautifulSoup
#import pandas as pd
import csv
def get_page(url):
response = requests.get(url)
if not response.ok:
print('server responded:', response.status_code)
else:
soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
return soup
def get_detail_data(soup):
try:
title = soup.find_all('span',class_="_18hrqvin",id=False).text
except:
title = 'empty'
print(title)
try:
reviews = soup.find_all('div',class_="_10za72m2",id=False).text
except:
reviews = 'empty revies'
print(reviews)
try:
total_reviews = soup.find_all('span',class_="_krjbj",id=False).text
except:
total_reviews = 'empty total revies'
print(total_reviews)
try:
total_reviews = soup.find_all('span',class_="_krjbj",id=False).text
except:
total_reviews = 'empty total revies'
print(total_reviews)
try:
commenter_name = soup.find_all('div',class_="_1p3joamp",id=False).text
except:
commenter_name = 'empty commenter_name'
print(commenter_name)
try:
comment_date = soup.find_all('span',class_="_1jlnvra2",id=False).text
except:
comment_date = 'empty comment_date'
print(comment_date)
try:
comment_date = soup.find_all('span',class_="_1jlnvra2",id=False).text
except:
comment_date = 'empty comment_date'
print(comment_date)
try:
comment = soup.find_all('div',class_="_czm8crp",id=False).text
except:
comment = 'empty comment'
print(comment)
def main():
url = "https://www.airbnb.com/rooms/34826867?source_impression_id=p3_1584615891_nVK823DKHNHuFWCQ"
get_detail_data(get_page(url))
if __name__ == '__main__':
main()
As suggested by #arcticsanto there is your get_page returns None if a bad response is found so get_detail_data fails to get BeautifulSoup so just add validation for soup in get_detail_data
def get_page(url):
response = requests.get(url)
if not response.ok:
print('server responded:', response.status_code)
else:
soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
return soup
def get_detail_data(soup):
if not soup:
return
----

I am trying to parse data from all pages. Only the first page is parsed

I am trying to parse data from all pages. Parsing ends after the first page. What could be the problem?
I use pagination with the use of a regular expression.
The first page of the site and others differ in the html code, so I have to create two different functions main_1 and main_2 for the first and other pages.
If you try to run only the main_2 function, nothing will work. .CSV file will not be created.
help me please.
import requests
from bs4 import BeautifulSoup
import csv
import re
def get_html(url):
r = requests.get(url)
if r.ok:
return r.text
print(r.status_code)
def writer_csv(data):
with open('tesr.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow((data['name'], data['url'], data['price']))
def get_data_page(html):
soup = BeautifulSoup(html, 'lxml')
trs = soup.find_all('tr', class_='cmc-table-row')
for tr in trs:
tds = tr.find_all('td')
try:
name = tds[1].find('a', class_='cmc-link').text.strip()
except:
name = ''
try:
url = 'https://coinmarketcap.com' + str(tds[1].find('a', class_='cmc-link').get('href'))
except:
url = ''
try:
price = tr.find('td', class_='cmc-table__cell--sort-by__price').find('a').text.strip().replace('$', '')
except:
price = ''
data = {'name': name,
'url': url,
'price': price}
writer_csv(data)
def main_1():
url_1 = 'https://coinmarketcap.com/'
get_data_page(get_html(url_1))
def main_2():
url_2 = 'https://coinmarketcap.com/2/'
while True:
get_data_page(get_html(url_2))
soup = BeautifulSoup(get_html(url_2), 'lxml')
try:
pattern = 'Next '
url_2 = 'https://coinmarketcap.com' + str(soup.find('ul', class_='pagination').find('a', text=re.compile(pattern)).get('href'))
except:
break
main_1()
main_2()

return BeautifulSoup AttributeError: 'NoneType' object has no attribute 'encode'

Good Night, why my return "dollar", is coming return "BeautifulSoup AttributeError: 'NoneType' object has no attribute 'encode'", what am I doing wrong? I want to bring the dollar quote in this code
import
from bs4 import BeautifulSoup
import urllib.request as url
site = "https://economia.uol.com.br/cotacoes/"
try:
url2 = site
con = url.urlopen(url2, None, 7)
if con.status == 200:
HTML = con.read()
soup = BeautifulSoup(HTML, 'html.parser')
dolar = re.search(r'Cotacao do Dolar: ',
str(soup.find('div', attrs={'class': 'subtituloGrafico subtituloGraficoValor'})))
return dolar
except Exception as e:
if str(e) == 'HTTP Error 404: NOT FOUND':
return 'Error'
return str(e)
As #Pythonista said because nothing in the soup was found. I try to search what you need by class in browser, then i found that class="subtituloGrafico subtituloGraficoValor" belongs to <a>. So i understand what you may desire to get.
Note: soup.find("h3",{"class":"tituloGrafico"}) is used to narrow your search
from bs4 import BeautifulSoup
import urllib.request as url
import re
site = "https://economia.uol.com.br/cotacoes/"
def func(site=site):
try:
url2 = site
con = url.urlopen(url2, None, 7)
if con.status == 200:
HTML = con.read()
soup = BeautifulSoup(HTML, 'lxml')
sub = soup.find("h3",{"class":"tituloGrafico"})
title = sub.findChild("a").text
dolar = sub.find('a', attrs={'class': "subtituloGrafico subtituloGraficoValor"}).text
print(title,dolar)
#dolar = re.search(r'Cotacao do Dolar:(.*?)',text)
return dolar
except Exception as e:
raise e
if str(e) == 'HTTP Error 404: NOT FOUND':
return 'Error'
return str(e)
func()
Simple. This:
soup.find('div', attrs={'class': 'subtituloGrafico subtituloGraficoValor'})
Is returning None because nothing in the soup was found. Make sure you're actually searching this correctly it's not created dynamically etcetera.
You can also search by text and extract it using BS without explicitly using re.

Categories

Resources