Our organization is using Worldometers for COVID-19 data. I'm able to scrape the state data from the page, but our leaders want the 7-day moving average for new cases and deaths. To do this manually, you have to click the 7-day moving average button and hover over today's date. Is there an automated method or module available to the public?
Link I can web scrape: https://www.worldometers.info/coronavirus/country/us/
The data I need is the 7-day moving average series from the daily cases and daily deaths charts on that page.
You can use regex to pull that out:
import requests
from bs4 import BeautifulSoup
import re

url = 'https://www.worldometers.info/coronavirus/country/us/'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')

# Find the <script> that builds the daily-cases Highcharts chart and pull out the '7-day moving average' series
for script in scripts:
    if "Highcharts.chart('graph-cases-daily'" in str(script):
        jsonStr = str(script)
        data = re.search(r"(name: '7-day moving average')[\s\S\W\w]*(data:[\s\S\W\w]*\d\])", jsonStr, re.IGNORECASE)
        data = data.group(2).split('data:')[-1].strip().replace('[','').replace(']','').split(',')
Output:
print(data[-1])
148755
Better yet, we can pull out the dates too and make a dataframe:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import ast

url = 'https://www.worldometers.info/coronavirus/country/us/'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')

for script in scripts:
    if "Highcharts.chart('graph-cases-daily'" in str(script):
        jsonStr = str(script)

        dates = re.search(r'(xAxis: {[\s\S\W\w]*)(categories: )(\[[\w\W\s\W]*\"\])', jsonStr)
        dates = dates.group(3).replace('[','').replace(']','')
        dates = ast.literal_eval(dates)
        dates = [x for x in dates]

        data = re.search(r"(name: '7-day moving average')[\s\S\W\w]*(data:[\s\S\W\w]*\d\])", jsonStr, re.IGNORECASE)
        data = data.group(2).split('data:')[-1].strip().replace('[','').replace(']','').split(',')

df = pd.DataFrame({'Date':dates, '7 Day Moving Average':data})
And to plot:
import matplotlib.pyplot as plt

# Skip the first row (non-numeric in the scraped series), cast to int, and plot against the date
df.iloc[1:].astype({'7 Day Moving Average': int}).plot(x='Date', y='7 Day Moving Average', kind='line')
plt.show()
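If you would rather compute the 7-day moving average yourself instead of scraping the precomputed series, pandas can do it with rolling(). A minimal sketch, assuming you have also extracted the chart's raw daily counts into a list called daily_cases (an assumption, not shown above) aligned with dates:

import pandas as pd

# daily_cases: hypothetical list of raw daily new-case counts, same length as dates
daily_df = pd.DataFrame({'Date': dates, 'New Cases': pd.to_numeric(daily_cases, errors='coerce')})
daily_df['7 Day Moving Average'] = daily_df['New Cases'].rolling(window=7).mean()
print(daily_df.tail(1))  # most recent date's computed average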
UPDATE:
To get each state, grab the href for each state from the main table, then pull the data from each state's page. I went ahead and combined all the tables, so you can just query the 'State' column for a specific state (see the usage example after the code):
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import ast

url = 'https://www.worldometers.info/coronavirus/country/us/'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

states_list = []
rows = soup.find('table', {'id':'usa_table_countries_today'}).find_all('tr')
for row in rows:
    if row.find_all('td'):
        tds = row.find_all('td')
        for data in tds:
            if data.find('a', {'class':'mt_a'}):
                href = data.find('a', {'class':'mt_a'})['href']
                states_list.append(href)

states_list = [x for x in states_list]

df_dict = {}
for state in states_list:
    print(state)
    df_dict[state] = []
    state_url = 'https://www.worldometers.info/' + state

    response = requests.get(state_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    scripts = soup.find_all('script')

    for script in scripts:
        for graph_type in ['cases','deaths']:
            if "Highcharts.chart('graph-%s-daily'" %graph_type in str(script):
                jsonStr = str(script)

                dates = re.search(r'(xAxis: {[\s\S\W\w]*)(categories: )(\[[\w\W\s\W]*\"\])', jsonStr)
                dates = dates.group(3).replace('[','').replace(']','')
                dates = ast.literal_eval(dates)
                dates = [x for x in dates]

                data = re.search(r"(name: '7-day moving average')[\s\S\W\w]*(data:[\s\S\W\w]*\d\])", jsonStr, re.IGNORECASE)
                data = data.group(2).split('data:')[-1].strip().replace('[','').replace(']','').split(',')

                df = pd.DataFrame({'Date':dates, '7 Day Moving Average - %s' %graph_type.title():data})
                df_dict[state].append(df)

# Combine the tables
df_list = []
for state, tables in df_dict.items():
    dfs = [df.set_index('Date') for df in tables]
    temp_df = pd.concat(dfs, axis=1).reset_index(drop=False)
    temp_df['State'] = state.split('/')[-2]
    df_list.append(temp_df)

results = pd.concat(df_list, axis=0)
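For example, to pull the most recent values for one state from the combined results table (the 'california' slug here is just an illustration; the State column contains whatever slug appears in each state's href):

ca = results[results['State'] == 'california']
print(ca.tail(1))  # latest date's 7-day moving averages for cases and deaths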
I was able to scrape the page with BeautifulSoup and locate the area I want - the 7-day average - but I'm having difficulty organizing the data into a data frame. Ultimately, I just want the value for the latest date, but I'm unsure how to get there.
import requests
from bs4 import BeautifulSoup
url = "https://www.worldometers.info/coronavirus/usa/california/#graph-cases-daily"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
all_scripts = soup.find_all('script')
I'm developing a web scraper to collect some information from AllMusic. However, I'm having difficulty returning the correct information when there is more than one option inside the tag (e.g. more than one href).
Question: I need to return the first music genre for each artist. In the case of one genre per artist, my code works. However, when an artist has more than one music genre, I'm not able to select just the first one.
Here is the code I created:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request

artists = ['Alexander 23', 'Alex & Sierra', 'Tion Wayne', 'Tom Cochrane', 'The Waked']
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

performer = []
links = []
genre = []

for artist in artists:
    url = urllib.request.urlopen("https://www.allmusic.com/search/artist/" + urllib.parse.quote(artist))
    soup = BeautifulSoup(requests.get(url.geturl(), headers=headers).content, "html.parser")
    div = soup.select("div.name")[0]
    link = div.find_all('a')[0]['href']
    links.append(link)

    for l in links:
        soup = BeautifulSoup(requests.get(l, headers=headers).content, "html.parser")
        divGenre = soup.select("div.genre")[0]
        genres = divGenre.find('a')

        performer.append(artist)
        genre.append(genres.text)

df = pd.DataFrame(zip(performer, genre, links), columns=["artist", "genre", "link"])
df
Hopefully I understand your question right - the main issue is that you iterate over the links inside your for-loop, and that causes the repetition.
Consider changing your strategy: get all the information in one iteration and store it in a more structured way.
Example
import requests
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request

artists = ['Alexander 23', 'Alex & Sierra', 'Tion Wayne', 'Tom Cochrane', 'The Waked']
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

data = []

for artist in artists:
    url = urllib.request.urlopen("https://www.allmusic.com/search/artist/" + urllib.parse.quote(artist))
    soup = BeautifulSoup(requests.get(url.geturl(), headers=headers).content, "html.parser")
    link = soup.select_one("div.name a").get('href')

    soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser")

    data.append({
        'artist': artist,
        'genre': soup.select_one("div.genre a").text,
        'link': link
    })

print(pd.DataFrame(data).to_markdown(index=False))
Output
| artist        | genre      | link                                                       |
|:--------------|:-----------|:-----------------------------------------------------------|
| Alexander 23  | Pop/Rock   | https://www.allmusic.com/artist/alexander-23-mn0003823464  |
| Alex & Sierra | Folk       | https://www.allmusic.com/artist/alex-sierra-mn0003280540   |
| Tion Wayne    | Rap        | https://www.allmusic.com/artist/tion-wayne-mn0003666177    |
| Tom Cochrane  | Pop/Rock   | https://www.allmusic.com/artist/tom-cochrane-mn0000931015  |
| The Waked     | Electronic | https://www.allmusic.com/artist/the-waked-mn0004025091     |
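One caveat worth adding (my note, not part of the original answer): if a search returns no hit, soup.select_one("div.name a") comes back as None and the .get('href') call raises AttributeError. A minimal guard inside the loop could look like:

hit = soup.select_one("div.name a")
if hit is None:
    # no search result for this artist - skip it
    continue
link = hit.get('href')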
From this URL https://doc8643.com/aircrafts I want to scrape all rows. Then, for each individual row (for example https://doc8643.com/aircraft/A139), I want to scrape these three areas of data:
<table class="table centered-table">
<h4>Manufacturers</h4>
<h4>Technical Data</h4>
Can this be done in Python?
import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://doc8643.com/aircrafts'
req = Request(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

with open('doc8643.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)

    while True:
        print(url)
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through the aircraft list and extract the text under each 'h3' tag
        for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
            writer.writerow([c.text if c.text else '' for c in row.select('h3')])
            print(row)

        # If more than one page then iterate through all of them
        if soup.select_one('ul.pagination li.active + li a'):
            url = soup.select_one('ul.pagination li.active + li a')['href']
        else:
            break
You should create a function that takes the value c.text (i.e. A139), builds the full URL like https://doc8643.com/aircraft/A139, and runs requests and BeautifulSoup to get all the data you need:
def scrape_details(number):
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # ... scrape details and put in list `results` ...

    return results
and run it in your loop
for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
    data = [c.text if c.text else '' for c in row.select('h3')]
    for item in data:
        values = scrape_details(item)
        writer.writerow([item] + values)
The biggest problem is scraping the details.
For some of them you need to scrape the dl element, then all the dt and dd elements, and use zip() to group them in pairs.
Something like:
def scrape_details(number):
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    results = []

    all_dl = soup.find_all('dl')
    for item in all_dl:
        all_dt = item.find_all('dt')
        all_dd = item.find_all('dd')
        for dt, dd in zip(all_dt, all_dd):
            pair = f"{dt.string}: {dd.string}"
            results.append(pair)
            print(pair)

    #print(results)
    return results
but this needs more code, so I skip this part.
Minimal working code
EDIT: I added url = 'https://doc8643.com' + url
import csv
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

# --- functions ---

def scrape_details(number):
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    results = []

    all_dl = soup.find_all('dl')
    for item in all_dl:
        all_dt = item.find_all('dt')
        all_dd = item.find_all('dd')
        for dt, dd in zip(all_dt, all_dd):
            pair = f"{dt.string}: {dd.string}"
            results.append(pair)
            print(pair)

    #print(results)
    return results

# --- main ---

url = 'https://doc8643.com/aircrafts'

with open('doc8643.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["data1", "data2", "data3", "etc..."])

    while True:
        print('url:', url)

        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Go through the aircraft list and extract the text under each 'h3' tag
        for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
            data = [c.text if c.text else '' for c in row.select('h3')]
            for item in data:
                values = scrape_details(item)
                writer.writerow([item] + values)

        # If more than one page then iterate through all of them
        if soup.select_one('ul.pagination li.active + li a'):
            url = soup.select_one('ul.pagination li.active + li a')['href']
            url = 'https://doc8643.com' + url
        else:
            break
BTW:
Maybe it would be better to keep the results as a dictionary:
results[dt.string] = [dd.string]
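A minimal sketch of that dictionary variant (my sketch, under the assumption that each dt label is unique per page; the CSV line is then built from the dict's items):

def scrape_details(number):
    url = 'https://doc8643.com/aircraft/' + number

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    results = {}  # dt text -> dd text
    for dl in soup.find_all('dl'):
        for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
            results[dt.get_text(strip=True)] = dd.get_text(strip=True)

    return results

# usage in the main loop:
# values = scrape_details(item)
# writer.writerow([item] + [f"{k}: {v}" for k, v in values.items()])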
I am doing web scraping in Python with BeautifulSoup and wondering if there is a way to get the value of a cell when it has no id. The code is below:
from bs4 import BeautifulSoup
import requests
import time
import datetime
URL = "https://www.amazon.co.uk/Got-Data-MIS-Business-Analyst/dp/B09F319PK2/ref=sr_1_1?keywords=funny+got+data+mis+data+systems+business+analyst+tshirt&qid=1636481904&qsid=257-9827493-6142040&sr=8-1&sres=B09F319PK2%2CB09F33452D%2CB08MCBFLHC%2CB07Y8Z4SF8%2CB07GJGXY7P%2CB07Z2DV1C2%2CB085MZDMZ8%2CB08XYL6GRM%2CB095CXJ226%2CB08JDMYMPV%2CB08525RB37%2CB07ZDNR6MP%2CB07WL5JGPH%2CB08Y67YF63%2CB07GD73XD8%2CB09JN7Z3G2%2CB078W9GXJY%2CB09HVDRJZ1%2CB07JD7R6CB%2CB08JDKYR6Q&srpt=SHIRT"
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 14092.77.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.107 Safari/537.36"}
page = requests.get(URL, headers = headers)
soup1 = BeautifulSoup(page.content, "html.parser")
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
title = soup2.find(id="productTitle").get_text()
price = soup2.find(id="priceblock_ourprice").get_text()
print(title)
print(price)
For this page, you have to select the garment size before the price is displayed. We can get the price from the dropdown list of sizes which is a SELECT with id = "dropdown_selected_size_name"
First let's get a list of the options in the SELECT dropdown:
options = soup2.find(id='variation_size_name').select('select option')
Then we can get the price, say for size 'Large':
for opt in options:
    if opt.get('data-a-html-content', '') == 'Large':
        print(opt['value'])
or a little more succinctly:
print([opt['value'] for opt in options if opt.get('data-a-html-content', '') == 'Large'][0])
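As a usage sketch built on the question's soup2 (and on the answer's assumption that each option's value attribute carries the price and that the variation_size_name container exists on the page):

options = soup2.find(id='variation_size_name').select('select option')

# Map each size label to the value the answer reads as the price
size_to_value = {opt.get('data-a-html-content', ''): opt.get('value') for opt in options}
print(size_to_value.get('Large'))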
I am writing some Python to scrape lottery numbers and other columns in a table.
The issue I have is extracting the text January 2001 from the anchor element in the first cell using Python and BeautifulSoup.
The code I have created so far:
import requests
from bs4 import BeautifulSoup

URL = "https://www.lotterysearch.org/results/2001"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15"
}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, "html.parser")

table = soup.find("table", {"style": "width:100%"})

# Get each table row 'tr'
for row in table.find_all("tr"):
    cells = row.findAll("td")
    # print(row.find("td").find("a"))
    draw_year = cells[0].find("a")
    draw_date = cells[0].find(text=True)
    # draw_date = cells[0].find(text=True)
    winning_numbers = cells[1].find(text=True)
    jackpot = cells[3].find(text=True)
    draw_number = cells[4].find(text=True)
    print(draw_year)
The results that get printed are the whole anchor element, not just the text January 2001.
I could do some substringing to pull out the January 2001, but I want to find the correct method for doing so.
I made this quick change. Please let me know if it is helpful. I think it prints a relative URL but you can combine it with the base URL.
draw_year = cells[0].find("a", href=True)
if draw_year is not None:
    print(draw_year['href'])
I got it now. Add this to the end. I added the if statement because you get a None type in the output. Is this how you want it?
if draw_year is not None:
    print(draw_year.get_text())
Late answer, but you can also use:
import requests
from bs4 import BeautifulSoup

h = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15"}
u = "https://www.lotterysearch.org/results/2001"
html = requests.get(u, headers=h).text
soup = BeautifulSoup(html, "html.parser")

table = soup.find("table", {"style": "width:100%"})
for row in table.find_all("tr"):
    cells = row.findAll("td")
    draw_year = cells[0].findAll("a")
    if not len(draw_year) == 1: continue  # skip 1st tr that only contains Date
    draw_year = draw_year[0].text
    draw_date = cells[0].find(text=True)
    winning_numbers = cells[1].find(text=True)
    jackpot = cells[3].find(text=True)
    draw_number = cells[4].find(text=True)
    print(draw_year)
January 2001
January 2001
January 2001
January 2001
January 2001
January 2001
January 2001
...
So the issue was that I was not catering for None when trying to extract the .text, so I tested for None with if type(draw_year) != type(None):
import requests
from bs4 import BeautifulSoup

URL = "https://www.lotterysearch.org/results/2001"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15"
}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, "html.parser")

table = soup.find("table", {"style": "width:100%"})

# Get each table row 'tr'
for row in table.find_all("tr"):
    cells = row.findAll("td")
    draw_year = cells[0].find("a")
    draw_date = cells[0].find(text=True)
    # draw_date = cells[0].find(text=True)
    winning_numbers = cells[1].find(text=True)
    jackpot = cells[3].find(text=True)
    draw_number = cells[4].find(text=True)
    if type(draw_year) != type(None):
        print(draw_year.text)
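A small style note (mine, not from the answers above): the idiomatic way to test for None in Python is an identity check, as the first answer already did:

if draw_year is not None:
    print(draw_year.text)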
I want to search for different company names on the website. Website link: https://www.firmenwissen.de/index.html
On this website, I want to use the search engine to search for companies. Here is the code I am trying to use:
from bs4 import BeautifulSoup as BS
import requests
import re

companylist = ['ABEX Dachdecker Handwerks-GmbH']

url = 'https://www.firmenwissen.de/index.html'

payloads = {
    'searchform': 'UFT-8',
    'phrase': 'ABEX Dachdecker Handwerks-GmbH',
    "mainSearchField__button": 'submit'
}

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
html = requests.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'html.parser')

link_list = []
links = soup.findAll('a')
for li in links:
    link_list.append(li.get('href'))
print(link_list)
This code should bring me the next page with the company information, but unfortunately it returns only the home page. How can I fix this?
Change the initial url you are posting the search to. Grab the appropriate hrefs only and add them to a set to ensure no duplicates (or alter the selector to return only one match if possible); add those items to a final set for looping, so you only loop over the required number of links. I have used a Session on the assumption you will repeat this for many companies.
Iterate over the set using selenium to navigate to each company url and extract whatever info you need.
This is an outline:
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver

d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH', 'SUCHMEISTEREI GmbH']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}

finalLinks = set()

## searches section; gather into set
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase': company,
            "mainSearchField__button": 'submit'
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        companyLinks = {baseUrl + item['href'] for item in soup.select("[href*='firmeneintrag/']")}
        # print(soup.select_one('.fp-result').text)
        finalLinks = finalLinks.union(companyLinks)

for item in finalLinks:
    d.get(item)
    info = d.find_element_by_css_selector('.yp_abstract_narrow')
    address = d.find_element_by_css_selector('.yp_address')
    print(info.text, address.text)

d.quit()
Just the first links:
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver

d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH', 'SUCHMEISTEREI GmbH', 'aktive Stuttgarter']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}

finalLinks = []

## searches section; add to list
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase': company,
            "mainSearchField__button": 'submit'
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        companyLink = baseUrl + soup.select_one("[href*='firmeneintrag/']")['href']
        finalLinks.append(companyLink)

for item in set(finalLinks):
    d.get(item)
    info = d.find_element_by_css_selector('.yp_abstract_narrow')
    address = d.find_element_by_css_selector('.yp_address')
    print(info.text, address.text)

d.quit()
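One caveat if you run this today (my note, not part of the original answer): the find_element_by_css_selector helpers were deprecated and later removed in Selenium 4, so with a current Selenium the lookups would be written as:

from selenium.webdriver.common.by import By

info = d.find_element(By.CSS_SELECTOR, '.yp_abstract_narrow')
address = d.find_element(By.CSS_SELECTOR, '.yp_address')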