Scraping all reviews of a movie from Rotten Tomatoes using BeautifulSoup - Python

I am trying to get all reviews of a movie from here: https://www.rottentomatoes.com/m/interstellar_2014/reviews. But as you can see on the web page, only about 19 reviews are shown, so my code below only prints the first 19 reviews.
## First we import the modules necessary to open URLs (basically websites)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

def scrapeUrl(url):
    """ scrape data from url - give url as a parameter """
    page = urlopen(url)
    html_bytes = page.read()
    html = html_bytes.decode("utf-8")
    #print(html)
    soup = BeautifulSoup(html, "html.parser")
    return soup

def findReviews(soup):
    """ find reviews using class="the_review" """
    NoneType = type(None)
    reviews = []
    for element in soup.find_all("div"):
        i = element.get("class")
        if isinstance(i, NoneType) == False:
            if 'the_review' in i:
                reviews.append(element.text)
    dfrev = pd.DataFrame(reviews, columns=['reviews'])
    return dfrev

url = "https://www.rottentomatoes.com/m/interstellar_2014/reviews"
sc = scrapeUrl(url)
t = findReviews(sc)
print(t)

You can do this without BeautifulSoup, as Rotten Tomatoes retrieves the reviews from an API. So you could first extract the movie id from the URL with a regex, then loop API requests until the last page and load the data with pandas:
import pandas as pd
import requests
import re
import time

headers = {
    'Referer': 'https://www.rottentomatoes.com/m/notebook/reviews?type=user',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

s = requests.Session()

def get_reviews(url):
    r = requests.get(url)
    movie_id = re.findall(r'(?<=movieId":")(.*)(?=","type)', r.text)[0]
    api_url = f"https://www.rottentomatoes.com/napi/movie/{movie_id}/criticsReviews/all"  # use reviews/user for user reviews
    payload = {
        'direction': 'next',
        'endCursor': '',
        'startCursor': '',
    }
    review_data = []
    while True:
        r = s.get(api_url, headers=headers, params=payload)
        data = r.json()
        review_data.extend(data['reviews'])  # collect the current page before checking for a next one
        if not data['pageInfo']['hasNextPage']:
            break
        payload['endCursor'] = data['pageInfo']['endCursor']
        payload['startCursor'] = data['pageInfo']['startCursor'] if data['pageInfo'].get('startCursor') else ''
        time.sleep(1)
    return review_data

data = get_reviews('https://www.rottentomatoes.com/m/interstellar_2014/reviews')
df = pd.json_normalize(data)
Output (first three rows of df, shown transposed):

row 0:
    creationDate: Oct 9, 2021
    isFresh: True
    isRotten: False
    isRtUrl: False
    isTop: False
    reviewUrl: https://www.nerdophiles.com/2014/11/05/interstellar-delivers-beauty-and-complexity-in-typical-nolan-fashion/
    quote: The inherent message of the film brings hope, but it can definitely get waterlogged by intellectual speak and long-winded scenes.
    reviewId: 2830324
    scoreOri: 3/5
    scoreSentiment: POSITIVE
    critic.name: Therese Lacson
    critic.criticPictureUrl: http://resizing.flixster.com/gGcp41zlZQ3sYdSbQoS8AATHp8Y=/128x128/v1.YzszODg1O2o7MTg5OTA7MjA0ODszMDA7MzAw
    critic.vanity: therese-lacson
    publication.id: 3888
    publication.name: Nerdophiles
row 1:
    creationDate: Aug 10, 2021
    isFresh: True
    isRotten: False
    isRtUrl: False
    isTop: False
    reviewUrl: https://www.centraltrack.com/space-oddity/
    quote: The film is indeed a sight to behold -- and one that demands to be seen on the biggest possible screen.
    reviewId: 2812665
    scoreOri: B
    scoreSentiment: POSITIVE
    critic.name: Kip Mooney
    critic.criticPictureUrl: http://resizing.flixster.com/hoYjdO_o-Ip21XnJaWr0C27-nbc=/128x128/v1.YzszOTk2O2o7MTg5OTA7MjA0ODs0MDA7NDAw
    critic.vanity: kip-mooney
    publication.id: 2577
    publication.name: Central Track
row 2:
    creationDate: Feb 2, 2021
    isFresh: True
    isRotten: False
    isRtUrl: False
    isTop: False
    reviewUrl: http://www.richardcrouse.ca/interstellar-3-stars-one-for-each-hour-of-the-movie-sentimental-sic-fi/
    quote: Nolan reaches for the stars with beautifully composed shots and some mind-bending special effects, but the dime store philosophy of the story never achieves lift off.
    reviewId: 2763105
    scoreOri: 3/5
    scoreSentiment: POSITIVE
    critic.name: Richard Crouse
    critic.criticPictureUrl: http://resizing.flixster.com/Ep5q7RwWq9Ud5KBhnha2sPnsRD0=/128x128/v1.YzszODgxO2o7MTg5OTA7MjA0ODszMDA7MzAw
    critic.vanity: richard-crouse
    publication.id: 3900
    publication.name: Richard Crouse
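If you only need the review text and scores, you can keep a subset of the normalized columns; a minimal follow-up, using only column names that appear in the output above:

# keep just the columns of interest from the normalized DataFrame
reviews_df = df[['creationDate', 'critic.name', 'quote', 'scoreOri', 'scoreSentiment']]
print(reviews_df.head())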


Problem with scraping a website: first 2 pages open, the rest do not

So I have been trying to write a data scraper for an online shop that sells cables and other supplies.
I wrote simple code that should work. The shop's products are divided into categories, and I started with the first category, cables.
for i in range(0, 27):
    url = "https://onninen.pl/produkty/Kable-i-przewody?query=/strona:{0}"
    url = url.format(i)
It works fine for the first two pages (i equal to 0 and 1, which return status code 200), but no matter when I try, pages 2 and above return error 500, and I have no idea why, especially since they open normally from the same link in a browser.
I even tried to randomize the time between requests :(
Any idea what might be the problem? Should I try another web scraping library?
Below is the full code:
import requests
from fake_useragent import UserAgent
import pandas as pd
from bs4 import BeautifulSoup
import time
import random

products = []  # List to store name of the product
MIN = []       # Manufacturer item number
prices = []    # List to store price of the product
df = pd.DataFrame()
user_agent = UserAgent()
i = 0
for i in range(0, 27):
    url = "https://onninen.pl/produkty/Kable-i-przewody?query=/strona:{0}"
    url = url.format(i)
    #print(url)
    # getting the response from the page using get method of requests module
    page = requests.get(url, headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"})
    #print(page.status_code)
    # storing the content of the page in a variable
    html = page.content
    # creating BeautifulSoup object
    page_soup = BeautifulSoup(html, "html.parser")
    #print(page_soup.prettify())
    for containers in page_soup.findAll('div', {'class': 'styles__ProductsListItem-vrexg1-2 gkrzX'}):
        name = containers.find('label', attrs={'class': 'styles__Label-sc-1x6v2mz-2 gmFpMA label'})
        price = containers.find('span', attrs={'class': 'styles__PriceValue-sc-33rfvt-10 fVFAzY'})
        man_it_num = containers.find('div', attrs={'title': 'Indeks producenta'})
        formatted_name = name.text.replace('Dodaj do koszyka: ', '')
        products.append(formatted_name)
        prices.append(price.text)
        MIN.append(man_it_num.text)
    df = pd.DataFrame({'Product Name': products, 'Price': prices, 'MIN': MIN})
    time.sleep(random.randint(2, 11))
#df.to_excel('output.xlsx', sheet_name='Kable i przewody')
The pages are loaded dynamically via an API, so to get all the data you have to call that API directly.
Example:
import pandas as pd
import requests

api_url = 'https://onninen.pl/api/search?query=/Kable-i-przewody/strona:{p}'
headers = {
    'user-agent': 'Mozilla/5.0',
    'referer': 'https://onninen.pl/produkty/Kable-i-przewody?query=/strona:2',
    'cookie': '_gid=GA1.2.1022119173.1663690794; _fuid=60a315c76d054fd5add850c7533f529e; _gcl_au=1.1.1522602410.1663690804; pollsvisible=[]; smuuid=1835bb31183-22686567c511-4116ddce-c55aa071-2639dbd6-ec19e64a550c; _smvs=DIRECT; poll_random_44=1; poll_visited_pages=2; _ga=GA1.2.1956280663.1663690794; smvr=eyJ2aXNpdHMiOjEsInZpZXdzIjo3LCJ0cyI6MTY2MzY5MjU2NTI0NiwibnVtYmVyT2ZSZWplY3Rpb25CdXR0b25DbGljayI6MCwiaXNOZXdTZXNzaW9uIjpmYWxzZX0=; _ga_JXR5QZ2XSJ=GS1.1.1663690794.1.1.1663692567.0.0.0'
}

dfs = []
for p in range(1, 28):
    d = requests.get(api_url.format(p=p), headers=headers).json()['items'][0]['items']
    df = pd.DataFrame(d)
    dfs.append(df)

df = pd.concat(dfs)
print(df)
Output:
id slug index catalogindex ... onntopcb isnew qc ads
0 147774 KABLE-ROZNE-MARKI-Kabel-energetyczny-YKY-ZO-3x... HES890 112271067D0500 ... 0 False None None
1 45315 KABLE-ROZNE-MARKI-Kabel-energetyczny-YKY-ZO-3x... HES893 112271068D0500 ... 0 False None None
2 169497 KABLE-ROZNE-MARKI-Kabel-energetyczny-YKY-ZO-3x... HES896 112271069D0500 ... 0 False None None
3 141820 KABLE-ROZNE-MARKI-Kabel-energetyczny-YKY-ZO-4x... HES900 112271056D0500 ... 0 False None None
4 47909 KABLE-ROZNE-MARKI-Kabel-energetyczny-YKY-ZO-4x... HES903 112271064D0500 ... 0 False None None
.. ... ... ... ... ... ... ... ... ...
37 111419 NVENT-RAYCHEM-Kabel-grzejny-EM2-XR-samoreguluj... HDZ938 449561-000 ... 0 True None None
38 176526 NVENT-RAYCHEM-Przewod-stalooporowy-GM-2CW-35m-... HEA099 SZ18300102 ... 0 False None None
39 38484 DEVI-Mata-grzewcza-DEVIheat-150S-150W-m2-375W-... HAJ162 140F0332 ... 1 False None None
40 60982 DEVI-Mata-grzewcza-DEVImat-150T-150W-m2-375W-0... HAJ157 140F0448 ... 1 False None None
41 145612 DEVI-Czujnik-Devireg-850-rynnowy-czujnik-140F1... HAJ212 140F1086 ... 0 False None None
[1292 rows x 27 columns]
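If you also want the result in Excel, as the commented-out to_excel line in the question intended, pandas can write it directly (an engine such as openpyxl needs to be installed):

# write the combined DataFrame to an Excel sheet, mirroring the question's commented-out line
df.to_excel('output.xlsx', sheet_name='Kable i przewody', index=False)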

Web-Scraping using BeautifulSoup (missing values when scraping)

I have been trying to web scrape a realtor website using BeautifulSoup and encountered 2 difficulties that I cannot seem to fix.
Difficulties:
When I run my code below, I am missing some date values. The dataframe should hold 68 rows of data scraped from the first page. The description and title scrapes return 68 rows, but the date scrape returns 66. I don't get N/A values returned if it's missing either. Does anyone have an idea why this is? When I inspected the website elements, they had the same tags, except the listings are marked as VIP or Special (promotion) apartments.
Secondly, I cannot seem to figure out how to scrape meta itemprop tags. I keep getting blank values when I use:
for tag in soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
    for tag2 in tag.findAll('div', attrs={'class': 'announcement-block__date'}):
Thank you in advance for any assistance you could provide.
Python Code:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd

def get_headers():
    # Headers
    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
               'accept-language': 'en-US,en;q=0.9',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    return headers

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

count = 1  # for pagination

# Make list holders
title = []
description = []
date = []

urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']
for x in urls:
    count = 1
    y = x
    while count < 2:  # will get only 1st page
        print(x)
        req = Request(x, headers=get_headers())  # request with all headers
        htmlfile = urlopen(req)
        htmltext = htmlfile.read()
        soup = bsoup(htmltext, 'html.parser')
        for tag in soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
            for tag2 in tag.findAll('a', attrs={'class': 'announcement-block__title'}):
                text = tag2.get_text().strip()
                if len(text) > 0:
                    title.append(text)
                else:
                    title.append('N/A')
        for tag in soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
            for tag2 in tag.findAll('div', attrs={'class': 'announcement-block__description'}):
                text = tag2.get_text().strip()
                if len(text) > 0:
                    description.append(text)
                else:
                    description.append('N/A')
        for tag in soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
            for tag2 in tag.findAll('div', attrs={'class': 'announcement-block__date'}):
                text = tag2.get_text().strip()
                if len(text) > 0:
                    date.append(text)
                else:
                    date.append('N/A')
        # Go to next page
        count = count + 1
        page = '?page=' + str(count)
        x = y + page

data_frame = pd.DataFrame(list(zip(title, description, date)), columns=['Title', 'Description', 'Date'])
You get 66 dates because your date[] list contains only 66 elements; you need to check all three fields at once, in a single for loop. Your if/else checks do nothing, as there are no announcement-block__date divs with empty content on the page.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd

def get_headers():
    # Headers
    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
               'accept-language': 'en-US,en;q=0.9',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    return headers

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

count = 1  # for pagination

# Make list holder
info = {
    'title': [],
    'description': [],
    'date': []
}

urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']
for x in urls:
    count = 1
    y = x
    while count < 2:  # will get only 1st page
        print(x)
        req = Request(x, headers=get_headers())  # request with all headers
        htmlfile = urlopen(req)
        htmltext = htmlfile.read()
        soup = bsoup(htmltext, 'html.parser')
        for tag in soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
            title = tag.find('a', attrs={'class': 'announcement-block__title'})
            description = tag.find('div', attrs={'class': 'announcement-block__description'})
            date = tag.find('div', attrs={'class': 'announcement-block__date'})
            info['title'].append(title.get_text().strip() if title else 'N/A')
            info['description'].append(description.get_text().strip() if description else 'N/A')
            info['date'].append(date.get_text().strip() if date else 'N/A')
        # Go to next page
        count = count + 1
        page = '?page=' + str(count)
        x = y + page

data_frame = pd.DataFrame(list(zip(info['title'], info['description'], info['date'])), columns=['Title', 'Description', 'Date'])
print(len(info['title']), len(info['description']), len(info['date']))
print(data_frame)
About your second question, a similar question has already been answered here
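For completeness, a minimal sketch of reading a meta itemprop value with BeautifulSoup; the itemprop name used here ('price') is hypothetical, so substitute whatever attribute the listing blocks actually carry:

# hypothetical example: read the content attribute of a <meta itemprop="..."> tag inside a listing block
meta = tag.find('meta', attrs={'itemprop': 'price'})  # 'price' is an assumed itemprop name
value = meta['content'] if meta and meta.has_attr('content') else 'N/A'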

Trouble with Beautiful Soup Scraping

I am working on scraping multiple pages of search results from this website into a neatly formatted pandas dataframe.
I've outlined the steps I plan to follow to finish this task.
1.) Identify information from each result I want to pull (3 things)
2.) Pull all the information from the 3 things into separate lists
3.) Append items in lists through for loop into pandas dataframe
Here is what I've tried so far:
import requests
import pandas as pd
#!pip install bs4
from bs4 import BeautifulSoup as bs

url = 'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
result = requests.get(url, headers=headers)
soup = bs(result.text, 'html.parser')
titles = soup.find_all('h5')
authors = soup.find_all('p')
#dates = soup.find_all('')

# append in for loop
data = []
for i in range(2, 22):
    data.append(titles[i].text)
    data.append(authors[i].text)
    #data.append(dates[i].text)
data = pd.DataFrame()
Before I convert data to a pandas dataframe, I can see the results, but the last line essentially erases the results.
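(For reference, the erasure happens because data = pd.DataFrame() rebinds the name to a brand-new empty frame, discarding the list that was built. A minimal sketch that keeps the scraped items instead, assuming the list alternates title/author as in the loop above:)

# data alternates title, author, title, author, ... so split it by even/odd positions
df = pd.DataFrame({'Title': data[0::2], 'Author': data[1::2]})
print(df.head())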
Also, I'm not quite sure how to iterate over the multiple search result pages. I found some code that allows you to pick a starting and ending web page to iterate over like this:
URL = ['https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy&page=2',
       'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy&page=4']

for url in range(0, 2):
    req = requests.get(URL[url])
    soup = bs(req.text, 'html.parser')
    titles = soup.find_all('h5')
    print(titles)
The issue I'm having with this approach is that the first page is not formatted the same as all the other pages. Starting on page two, the end of the url reads, "&page=2". Not sure how to account for that.
To summarize the end result I'm looking for would be a dataframe that looks something like this:
Title Author Date
Blah1 Agency1 09/23/2020
Blah2 Agency2 08/22/2018
Blah3 Agency3 06/02/2017
....
Can someone please help point me in the right direction? Very lost on this one.
I think you don't need to parse all the pages; just download the CSV.
import pandas as pd
import requests
import io
url = 'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy'
url += '&format=csv' # <- Download as CSV
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
result = requests.get(url, headers=headers)
df = pd.read_csv(io.StringIO(result.text))
Output:
>>> df
title type ... pdf_url publication_date
0 Corporate Average Fuel Economy Standards for M... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/03/2021
1 Public Hearing for Corporate Average Fuel Econ... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/14/2021
2 Investigation of Urea Ammonium Nitrate Solutio... Notice ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/08/2021
3 Anchorage Regulations; Mississippi River, Mile... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-08... 08/30/2021
4 Call for Nominations To Serve on the National ... Notice ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/08/2021
.. ... ... ... ... ...
112 Endangered and Threatened Wildlife and Plants;... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/07/2021
113 Energy Conservation Program: Test Procedures f... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/01/2021
114 Taking of Marine Mammals Incidental to Commerc... Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/17/2021
115 Partial Approval and Partial Disapproval of Ai... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/24/2021
116 Clean Air Plans; California; San Joaquin Valle... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/01/2021
[117 rows x 8 columns]
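If you want something close to the Title/Date table sketched in the question, select from the columns visible in the output above; check df.columns to see whether the export also includes an agency column (the exact column names beyond those shown here are not guaranteed):

# inspect the available columns, then keep the ones matching the desired output
print(df.columns.tolist())
subset = df[['title', 'publication_date']]
print(subset.head())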
If I understand your question, then here is a working solution. The starting URL and the URL with page number 1 are the same thing, and I scrape page range(1, 5), meaning 4 pages. You can increase or decrease the range of page numbers at any time. To store the data in CSV format, uncomment the last line.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

data = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'}

for page in range(1, 5):
    url = 'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy%27&page={page}'.format(page=page)
    print(url)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    tags = soup.find_all('div', class_='document-wrapper')
    for pro in tags:
        title = pro.select_one('h5 a').get_text(strip=True)
        author = pro.select_one('p a:nth-child(1)').get_text(strip=True)
        date = pro.select_one('p a:nth-child(2)').get_text(strip=True)
        data.append([title, author, date])

cols = ["Title", "Author", "Date"]
df = pd.DataFrame(data, columns=cols)
print(df)
#df.to_csv("data_info.csv", index=False)

How do I web scrape Worldometer charts?

Our organization is using Worldometers for COVID-19 data. I'm able to scrape the page state data, but our leaders want the 7-day moving average for new cases and deaths. To do this manually, you have to click on the 7-day moving average button and hover over today's date. Is there an automated method or module that is available to the public?
Link I can web scrape: https://www.worldometers.info/coronavirus/country/us/
Data I need in the images below.
You can use regex to pull that out:
import requests
from bs4 import BeautifulSoup
import re

url = 'https://www.worldometers.info/coronavirus/country/us/'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')

for script in scripts:
    if "Highcharts.chart('graph-cases-daily'" in str(script):
        jsonStr = str(script)
        data = re.search(r"(name: '7-day moving average')[\s\S\W\w]*(data:[\s\S\W\w]*\d\])", jsonStr, re.IGNORECASE)
        data = data.group(2).split('data:')[-1].strip().replace('[', '').replace(']', '').split(',')
Output:
print(data[-1])
148755
Better yet, we can pull out the dates too and make a dataframe:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import ast

url = 'https://www.worldometers.info/coronavirus/country/us/'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')

for script in scripts:
    if "Highcharts.chart('graph-cases-daily'" in str(script):
        jsonStr = str(script)
        dates = re.search(r'(xAxis: {[\s\S\W\w]*)(categories: )(\[[\w\W\s\W]*\"\])', jsonStr)
        dates = dates.group(3).replace('[', '').replace(']', '')
        dates = ast.literal_eval(dates)
        dates = [x for x in dates]
        data = re.search(r"(name: '7-day moving average')[\s\S\W\w]*(data:[\s\S\W\w]*\d\])", jsonStr, re.IGNORECASE)
        data = data.group(2).split('data:')[-1].strip().replace('[', '').replace(']', '').split(',')

df = pd.DataFrame({'Date': dates, '7 Day Moving Average': data})
And to plot:
import matplotlib.pyplot as plt
df.iloc[1:]['7 Day Moving Average'].astype(int).plot(x ='Date', y='7 Day Moving Average', kind = 'line')
plt.show()
UPDATE:
To get each state, we grabbed the href for each of them then pulled out the data. I went ahead and combined all the tables and you can just query the 'State' column for a specific state:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import ast

url = 'https://www.worldometers.info/coronavirus/country/us/'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

states_list = []
rows = soup.find('table', {'id': 'usa_table_countries_today'}).find_all('tr')
for row in rows:
    if row.find_all('td'):
        tds = row.find_all('td')
        for data in tds:
            if data.find('a', {'class': 'mt_a'}):
                href = data.find('a', {'class': 'mt_a'})['href']
                states_list.append(href)

states_list = [x for x in states_list]

df_dict = {}
for state in states_list:
    print(state)
    df_dict[state] = []
    state_url = 'https://www.worldometers.info/' + state
    response = requests.get(state_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    scripts = soup.find_all('script')
    for script in scripts:
        for graph_type in ['cases', 'deaths']:
            if "Highcharts.chart('graph-%s-daily'" % graph_type in str(script):
                jsonStr = str(script)
                dates = re.search(r'(xAxis: {[\s\S\W\w]*)(categories: )(\[[\w\W\s\W]*\"\])', jsonStr)
                dates = dates.group(3).replace('[', '').replace(']', '')
                dates = ast.literal_eval(dates)
                dates = [x for x in dates]
                data = re.search(r"(name: '7-day moving average')[\s\S\W\w]*(data:[\s\S\W\w]*\d\])", jsonStr, re.IGNORECASE)
                data = data.group(2).split('data:')[-1].strip().replace('[', '').replace(']', '').split(',')
                df = pd.DataFrame({'Date': dates, '7 Day Moving Average - %s' % graph_type.title(): data})
                df_dict[state].append(df)

# Combine the tables
df_list = []
for state, tables in df_dict.items():
    dfs = [df.set_index('Date') for df in tables]
    temp_df = pd.concat(dfs, axis=1).reset_index(drop=False)
    temp_df['State'] = state.split('/')[-2]
    df_list.append(temp_df)

results = pd.concat(df_list, axis=0)
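A quick usage example: the State column is derived from each href (for example a path ending in usa/california/ becomes california), so you can filter the combined frame for one state; the state name below is just an assumed example:

# pull the 7-day moving averages for a single state from the combined table
california = results[results['State'] == 'california']
print(california.tail())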
I was able to just scrape the page using BeautifulSoup. I got the area I want - finding the 7-day average - but I'm having difficulties trying to organize the data into a data frame. Ultimately, I just want the latest date, but I'm unsure about how to get there.
import requests
from bs4 import BeautifulSoup
url = "https://www.worldometers.info/coronavirus/usa/california/#graph-cases-daily"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
all_scripts = soup.find_all('script')
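Building on the regex approach from the answer above, here is a hedged sketch for pulling the latest date and 7-day average out of those scripts; it assumes the California page embeds the same Highcharts 'graph-cases-daily' structure as the US page handled earlier:

import re
import ast

latest_date, latest_avg = None, None
for script in all_scripts:
    if "Highcharts.chart('graph-cases-daily'" in str(script):
        jsonStr = str(script)
        # dates come from the xAxis categories, values from the 7-day moving average series
        dates = re.search(r'(xAxis: {[\s\S\W\w]*)(categories: )(\[[\w\W\s\W]*\"\])', jsonStr)
        dates = ast.literal_eval(dates.group(3).replace('[', '').replace(']', ''))
        data = re.search(r"(name: '7-day moving average')[\s\S\W\w]*(data:[\s\S\W\w]*\d\])", jsonStr, re.IGNORECASE)
        data = data.group(2).split('data:')[-1].strip().replace('[', '').replace(']', '').split(',')
        latest_date, latest_avg = dates[-1], data[-1]  # last entries are the most recent day
print(latest_date, latest_avg)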

Scraping rating from Tripadvisor

I'm scraping the activities to do in Paris from TripAdvisor (https://www.tripadvisor.it/Attractions-g187147-Activities-c42-Paris_Ile_de_France.html).
The code that I've written works well, but I still haven't found a way to obtain the rating of each activity. The rating on TripAdvisor is represented by 5 circles, and I need to know how many of these circles are filled in.
I get nothing in the "rating" field.
Here is the code:
wd = webdriver.Chrome('chromedriver', chrome_options=chrome_options)
wd.get("https://www.tripadvisor.it/Attractions-g187147-Activities-c42-Paris_Ile_de_France.html")

import pprint
detail_tours = []
for tour in list_tours:
    url = tour.find_elements_by_css_selector("a")[0].get_attribute("href")
    title = ""
    reviews = ""
    rating = ""
    if len(tour.find_elements_by_css_selector("._1gpq3zsA._1zP41Z7X")) > 0:
        title = tour.find_elements_by_css_selector("._1gpq3zsA._1zP41Z7X")[0].text
    if len(tour.find_elements_by_css_selector("._7c6GgQ6n._22upaSQN._37QDe3gr.WullykOU._3WoyIIcL")) > 0:
        reviews = tour.find_elements_by_css_selector("._7c6GgQ6n._22upaSQN._37QDe3gr.WullykOU._3WoyIIcL")[0].text
    if len(tour.find_elements_by_css_selector(".zWXXYhVR")) > 0:
        rating = tour.find_elements_by_css_selector(".zWXXYhVR")[0].text
    detail_tours.append({'url': url,
                         'title': title,
                         'reviews': reviews,
                         'rating': rating})
I would use BeautifulSoup in a way similar to the suggested code. (I would also recommend you study the structure of the html, but seeing the original code I don't think that's necessary.)
import requests
from bs4 import BeautifulSoup
import re

header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"}
resp = requests.get('https://www.tripadvisor.it/Attractions-g187147-Activities-c42-Paris_Ile_de_France.html', headers=header)
if resp.status_code == 200:
    soup = BeautifulSoup(resp.text, 'lxml')
    cards = soup.find_all('div', {'data-automation': 'cardWrapper'})
    for card in cards:
        rating = card.find('svg', {'class': 'zWXXYhVR'})
        match = re.match('Punteggio ([0-9,]+)', rating.attrs['aria-label'])[1]
        print(float(match.replace(',', '.')))
And a small bonus: the number in the link prefixed with oa (in the example below, oa60) indicates the starting offset, which runs in increments of 30 results. So if you want to change pages, you can change your link to include oa30, oa60, oa90, etc.: https://www.tripadvisor.it/Attractions-g187147-Activities-c42-oa60-Paris_Ile_de_France.html
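A small sketch of generating those paginated links from the 30-result offsets described above:

# build the paginated TripAdvisor URLs by inserting the oa<offset> segment (30 results per page)
base = 'https://www.tripadvisor.it/Attractions-g187147-Activities-c42-{}Paris_Ile_de_France.html'
urls = [base.format('' if offset == 0 else 'oa{}-'.format(offset)) for offset in range(0, 120, 30)]
for u in urls:
    print(u)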
