How to extract movie genre from Metacritic website using BeautifulSoup - python

I want to do this for the top 500 movies of Metacritic found at https://www.metacritic.com/browse/movies/score/metascore/all/filtered?sort=desc
Each genre will be extracted from a detail link like this(for the first one): https://www.metacritic.com/movie/citizen-kane-1941/details
Just need some help on the extraction of the genre part from the HTML from the above-detailed link
My get_genre function (but I get an attribute error)
def get_genre(detail_link):
detail_page = requests.get(detail_link, headers = headers)
detail_soup = BeautifulSoup(detail_page.content, "html.parser")
try:
#time.sleep(1)
table=detail_soup.find('table',class_='details',summary=movie_name +" Details and Credits")
#print(table)
gen_line1=table.find('tr',class_='genres')
#print(gen_line1)
gen_line=gen_line1.find('td',class_='data')
#print(gen_line)
except:
time.sleep(1)
year=detail_soup.find(class_='release_date')
year=year.findAll('span')[-1]
year=year.get_text()
year=year.split()[-1]
table=detail_soup.find('table',class_='details',summary=movie_name +" ("+ year +")"+" Details and Credits")
#print(table)
gen_line1=table.find('tr',class_='genres')
#print(gen_line1)
gen_line=gen_line1.find('td',class_='data')
genres=[]
for line in gen_line:
genre = gen_line.get_text()
genres.append(genre.strip())
genres=list(set(genres))
genres=(str(genres).split())
return genres

you're too much focused on getting the table. just use what elements you're sure about. here's an example with select
from bs4 import BeautifulSoup
import requests
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_0) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/58.0.849.0 Safari/536.1'}
detail_link="https://www.metacritic.com/movie/citizen-kane-1941/details"
detail_page = requests.get(detail_link, headers = headers)
detail_soup = BeautifulSoup(detail_page.content, "html.parser")
genres=detail_soup.select('tr.genres td.data span')
print([genre.text for genre in genres])
>>> ['Drama', 'Mystery']

Related

BS4 - 'NoneType' object has no attribute 'findAll' when scanning spans on amazon page

I'm following a Udemy course on learning BS4 and it seems to be a bit outdated so I'm having trouble with this part.
The objective is to scrape the price of this TV from this amazon page, and in the course the instructor also gets this error and says he fixes it by changing the class name he's searching for via findAll. I tried the same thing (meaning different class not the same one he used) and was met again with the attribute error. According to the answer for a similar issue, the class being searched for didn't contain what was being looked for, but I don't believe the same is happening to me.
The code: https://pastebin.com/SMQBXt31
`
from datetime import datetime
import requests
import csv
import bs4
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15"
REQUEST_HEADER = {
"User-Agent": USER_AGENT,
"Accept-Language": "en-US, en;q=0.5"
}
def get_page_html(url):
res = requests.get(url=url, headers=REQUEST_HEADER) #res = response
return res.content
def get_product_price(soup):
main_price_span = soup.find("span", attrs={
"class": "a-price aok-align-center reinventPricePriceToPayPadding priceToPay"
})
price_spans = main_price_span.findAll("span")
for span in price_spans:
price = span.text.strip().replace("$", "").replace(",", "")
print(price)
def extract_product_info(url):
product_info = {}
print(f"Scraping URL: {url}")
html = get_page_html(url)
soup = bs4.BeautifulSoup(html, "lxml")
product_info["price"] = get_product_price(soup)
if __name__ == '__main__':
with open("amazon_products_urls.csv", newline="") as csvfile:
reader = csv.reader(csvfile, delimiter=",")
for row in reader:
url = row[0]
print(extract_product_info(url))
`
The website:https://www.amazon.com/Hisense-Premium-65-Inch-Compatibility-65U8G/dp/B091XWTGXL/ref=sr_1_1_sspa?crid=3NYCKNFHL6DU2&keywords=hisense%2Bpremium%2B65%2Binch&qid=1651840513&sprefix=hisense%2Bpremium%2B65%2Binch%2B%2Caps%2C116&sr=8-1-spons&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEyVzUyTjBMS1JCVFVRJmVuY3J5cHRlZElkPUEwNDY2ODc0MlozVlFMVFJKQ0s2VyZlbmNyeXB0ZWRBZElkPUEwODI5OTgxMTRZSjdMMzYyQjk4NyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1
There are lot of spans from that you have to select only the price span class correctly which are located in [class="a-size-mini olpWrapper"]
price_spans = main_price_span.find_all("span",class_="a-size-mini olpWrapper")
for span in price_spans:
price = span.text.strip().replace("$", "").replace(",", "")
print(price)
#OR
price_spans =[x.get_text(strip=True).replace("$", "") for x in main_price_span.find("span",class_="a-size-mini olpWrapper")]

How to select first element in multi-valued html tags?

I'm developing a web scraping to collect some information from AllMusic. However, I am having difficulties to correctly return information when there is more than one option inside the tag (e.g. href).
Question: I need to return the first music genre for each artist. In the case of one value per artist, my code works. However, in situations with more than one music genre, I'm not able to select just the first one.
Here is the code created:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request
artists =['Alexander 23', 'Alex & Sierra', 'Tion Wayne', 'Tom Cochrane','The Waked']
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
performer = []
links = []
genre = []
for artist in artists:
url= urllib.request.urlopen("https://www.allmusic.com/search/artist/" + urllib.parse.quote(artist))
soup = BeautifulSoup(requests.get(url.geturl(), headers=headers).content, "html.parser")
div = soup.select("div.name")[0]
link = div.find_all('a')[0]['href']
links.append(link)
for l in links:
soup = BeautifulSoup(requests.get(l, headers=headers).content, "html.parser")
divGenre= soup.select("div.genre")[0]
genres = divGenre.find('a')
performer.append(artist)
genre.append(genres.text)
df = pd.DataFrame(zip(performer, genre, links), columns=["artist", "genre", "link"])
df
Hopfully understand your question right - Main issue is that you iterate the links inside your for-loop and that causes the repetition.
May change your strategy, try to get all information in one iteration and store them in a more structured way.
Example
import requests
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request
artists =['Alexander 23', 'Alex & Sierra', 'Tion Wayne', 'Tom Cochrane','The Waked']
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
data = []
for artist in artists:
url= urllib.request.urlopen("https://www.allmusic.com/search/artist/" + urllib.parse.quote(artist))
soup = BeautifulSoup(requests.get(url.geturl(), headers=headers).content, "html.parser")
link = soup.select_one("div.name a").get('href')
soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser")
data.append({
'artist':artist,
'genre':soup.select_one("div.genre a").text,
'link':link
})
print(pd.DataFrame(data).to_markdown(index=False))
Output
artist
genre
link
Alexander 23
Pop/Rock
https://www.allmusic.com/artist/alexander-23-mn0003823464
Alex & Sierra
Folk
https://www.allmusic.com/artist/alex-sierra-mn0003280540
Tion Wayne
Rap
https://www.allmusic.com/artist/tion-wayne-mn0003666177
Tom Cochrane
Pop/Rock
https://www.allmusic.com/artist/tom-cochrane-mn0000931015
The Waked
Electronic
https://www.allmusic.com/artist/the-waked-mn0004025091

Price comparison - python

Hi guys i am trying to create a program in python that compares prices from websites but i cant get the prices. I have managed to ge the title of the product and the quantity using the code bellow.
page = requests.get(urls[7],headers=Headers)
soup = BeautifulSoup(page.text, 'html.parser')
title = soup.find("h1",{"class" : "Titlestyles__TitleStyles-sc-6rxg4t-0 fDKOTS"}).get_text().strip()
quantity = soup.find("li", class_="quantity").get_text().strip()
total_price = soup.find('div', class_='Pricestyles__ProductPriceStyles-sc-118x8ec-0 fzwZWj price')
print(title)
print(quantity)
print(total_price)
Iam trying to get the price from this website (Iam creating a program do look for diper prices lol) https://www.drogasil.com.br/fralda-huggies-tripla-protecao-tamanho-m.html .
the price is not coming even if i get the text it always says that its nonetype.
Some of the information is built up via javascript from data stored in <script> sections in the HTML. You can access this directly by searching for it and using Python's JSON library to decode it into a Python structure. For example:
from bs4 import BeautifulSoup
import requests
import json
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
url = 'https://www.drogasil.com.br/fralda-huggies-tripla-protecao-tamanho-m.html'
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
script = soup.find('script', type='application/ld+json')
data = json.loads(script.text)
title = data['name']
total_price = data['offers']['price']
quantity = soup.find("li", class_="quantity").get_text().strip()
print(title)
print(quantity)
print(total_price)
Giving you:
HUGGIES FRALDAS DESCARTAVEL INFANTIL TRIPLA PROTECAO TAMANHO M COM 42 UNIDADES
42 Tiras
38.79
I recommend you add print(data) to see what other information is available.

BeautifulSoup: how to get the value of the price from the webpage's source code if there is no id within the source code for the price

I am doing web scraping in Python with BeautifulSoup and wondering if I there is a way of getting the value of a cell when it has no id. The code is as below:
from bs4 import BeautifulSoup
import requests
import time
import datetime
URL = "https://www.amazon.co.uk/Got-Data-MIS-Business-Analyst/dp/B09F319PK2/ref=sr_1_1?keywords=funny+got+data+mis+data+systems+business+analyst+tshirt&qid=1636481904&qsid=257-9827493-6142040&sr=8-1&sres=B09F319PK2%2CB09F33452D%2CB08MCBFLHC%2CB07Y8Z4SF8%2CB07GJGXY7P%2CB07Z2DV1C2%2CB085MZDMZ8%2CB08XYL6GRM%2CB095CXJ226%2CB08JDMYMPV%2CB08525RB37%2CB07ZDNR6MP%2CB07WL5JGPH%2CB08Y67YF63%2CB07GD73XD8%2CB09JN7Z3G2%2CB078W9GXJY%2CB09HVDRJZ1%2CB07JD7R6CB%2CB08JDKYR6Q&srpt=SHIRT"
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 14092.77.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.107 Safari/537.36"}
page = requests.get(URL, headers = headers)
soup1 = BeautifulSoup(page.content, "html.parser")
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
title = soup2.find(id="productTitle").get_text()
price = soup2.find(id="priceblock_ourprice").get_text()
print(title)
print(price)
For this page, you have to select the garment size before the price is displayed. We can get the price from the dropdown list of sizes which is a SELECT with id = "dropdown_selected_size_name"
First let's get a list of the options in the SELECT dropdown:
options = soup2.find(id='variation_size_name').select('select option')
Then we can get the price say for size 'Large'
for opt in options:
if opt.get('data-a-html-content', '') == 'Large':
print(opt['value'])
or a little more succinctly:
print([opt['value'] for opt in options if opt.get('data-a-html-content', '') == 'Large'][0])

Python Youtube Web Scraper not working properly

So i built this small script that would give back a URL of any searched video on youtube. But after opening it up again turns out that the web scraping with youtube is not working out properly. As when printing soup it returns something completely different than from what can be seen with inspect element on Youtube. Can someone help me solve this...
Heres My Code:
import requests
from lxml import html
import webbrowser
from bs4 import BeautifulSoup
import time
import tkinter
from pytube import YouTube
headers= {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"}
def video_finder():
word = input("Enter video title: ")
if ' ' in word:
new = word.replace(' ', '+')
print(new)
else:
pass
vid = requests.get('https://www.youtube.com/results?search_query={}'.format(new))
soup = BeautifulSoup(vid.text, features='lxml')
all_vids = soup.find_all('div', id_='contents')
print(all_vids)
video1st = all_vids[0]
a_Tag = video1st.find('a', class_="yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link", href=True)
Video_name = a_Tag.text
Video_id = a_Tag['href']
video_link = 'https://www.youtube.com' + Video_id
print(Video_name)
print(video_link)
Its not the best but ye... thank you
To get correct result from Youtube page, set User-Agent HTTP header to Googlebot, and use html.parser in BeautifulSoup.
For example:
import requests
from bs4 import BeautifulSoup
headers= {"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
def video_finder():
word = input("Enter video title: ")
params = {
'search_query': word
}
vid = requests.get('https://www.youtube.com/results', params=params, headers=headers)
soup = BeautifulSoup(vid.content, features='html.parser')
a_Tag = soup.find('a', class_="yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link", href=lambda h: h.startswith('/watch?'))
Video_name = a_Tag.text
Video_id = a_Tag['href']
video_link = 'https://www.youtube.com' + Video_id
print(Video_name)
print(video_link)
video_finder()
Prints:
Enter video title: sailor moon
Sailor Moon Opening (English) *HD*
https://www.youtube.com/watch?v=5txHGxJRwtQ

Categories

Resources