web-scraping and pagination with python, csv, beautifulsoup and Pandas - python

This website https://aviation-safety.net/wikibase/ DB begins from year 1902 to 2022. The code presented here captures some years for misses some as well. The years before 1912 and the year after 2021 are not captured. I want to scrape All Accidents for each type of aircraft for all or by year(s). This webDB starts from https://aviation-safety.net/wikibase/dblist.php?Year=1902 and should end on https://aviation-safety.net/wikibase/dblist.php?Year=2022. Currently, the code dumps the results in .csv file, but it could also be in SQLite.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
def scrape_year(year):
# use a default looking header to cover my tracks in case they block requests that don't have "accept" and "user-agent" which sometimes happens
headers = {
'accept':'*/*',
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
}
url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text,'html.parser')
page_container = soup.find('div',{'class':'pagenumbers'})
pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')]) # get the maximum number of pages using "list comprehension", I get all the links at the bottom of the page ('a' tags) and the get the [href] for each, but split it on "=" making each a list, then get the last one ([-1]) and turn the text into an integer so I can get the max of all the integers ie the last page number
info = []
for page in range(1,pages+1):
new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
print(new_url)
data = requests.get(new_url,headers=headers)
soup = BeautifulSoup(data.text,'html.parser')
table = soup.find('table',{'class':'hp'})
regex = re.compile('list.*')
for index,row in enumerate(table.find_all('tr',{'class':regex})):
if index == 0:
continue
acc_link = 'https://aviation-safety.net/'+row.find('a')['href']
try:
acc_date = datetime.strptime(row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
except ValueError:
try:
acc_date = datetime.strptime("01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
except ValueError:
try:
acc_date = datetime.strptime("01-01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
except ValueError:
continue
acc_type = row.find_all('td')[1].text
acc_reg = row.find_all('td')[2].text
acc_operator = row.find_all('td')[3].text
acc_fat = row.find_all('td')[4].text
acc_location = row.find_all('td')[5].text
acc_dmg = row.find_all('td')[7].text
item = {
'acc_link' : acc_link,
'acc_date': acc_date,
'acc_type': acc_type,
'acc_reg': acc_reg,
'acc_operator' :acc_operator,
'acc_fat':acc_fat,
'acc_location':acc_location,
'acc_dmg':acc_dmg
}
info.append(item)
df= pd.DataFrame(info)
df.to_csv(f'{year}_aviation-safety.csv', encoding='utf-8-sig', index=False)
if __name__ == "__main__":
START = 1901
STOP = 2023
years = [year for year in range(START,STOP+1)]
print(f'Scraping {len(years)} years of data')
with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
final_list = executor.map(scrape_year,years)

Lmao, I wrote that code for someone on this site once before. I've edited to work for the missing years here:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
def scrape_year(year):
try:
headers = {
'accept':'*/*',
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
}
url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text,'html.parser')
page_container = soup.find('div',{'class':'pagenumbers'})
try:
pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])
except:
pages = 1
info = []
for page in range(1,pages+1):
new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
print(new_url)
data = requests.get(new_url,headers=headers)
soup = BeautifulSoup(data.text,'html.parser')
table = soup.find('table',{'class':'hp'})
regex = re.compile('list.*')
for row in table.find_all('tr',{'class':regex}):
acc_link = 'https://aviation-safety.net/'+row.find('a')['href']
try:
acc_date = datetime.strptime(row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
except ValueError:
try:
acc_date = datetime.strptime("01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
except ValueError:
try:
acc_date = datetime.strptime("01-01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
except ValueError:
continue
acc_type = row.find_all('td')[1].text
acc_reg = row.find_all('td')[2].text
acc_operator = row.find_all('td')[3].text
acc_fat = row.find_all('td')[4].text
acc_location = row.find_all('td')[5].text
acc_dmg = row.find_all('td')[7].text
item = {
'acc_link' : acc_link,
'acc_date': acc_date,
'acc_type': acc_type,
'acc_reg': acc_reg,
'acc_operator' :acc_operator,
'acc_fat':acc_fat,
'acc_location':acc_location,
'acc_dmg':acc_dmg
}
info.append(item)
return info
except Exception as e:
print(e, url)
return []
if __name__ == "__main__":
START = 2022
STOP = 2023
years = [year for year in range(START,STOP+1)]
print(f'Scraping {len(years)} years of data')
with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
final_list = executor.map(scrape_year,years)
list_of_dicts= list(final_list)
flat_list = [item for sublist in list_of_dicts for item in sublist] #convert list of lists into one big list
df= pd.DataFrame(flat_list)
df.to_csv('all_years_aviation-safety.csv',index=False)

Related

Multiple values against the same tag not scraping

I'm getting no values for my "Number of Rooms" and "Room" search.
https://www.zoopla.co.uk/property/uprn/906032139/
I can see here that I should be returning something but not getting anything.
Can anyone possibly point me in the right direction of how to solve this? I am not even sure what to search for as it's not erroring. I thought it would put all the data in and then I would need to figure out a way to seperate it. Do I need to maybe scrape it into a dictionary?
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import matplotlib as plt
import time
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://google.co.uk",
"DNT": "1"
}
page = 1
addresses = []
while page != 2:
url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
print(url)
response = requests.get(url, headers=headers)
print(response)
html = response.content
soup = bs(html, "lxml")
time.sleep(1)
for address in soup.find_all("div", class_="c-rgUPM c-rgUPM-pnwXf-hasUprn-true"):
details = {}
# Getting the address
details["Address"] = address.h2.get_text(strip=True)
# Getting each addresses unique URL
scotland_house_url = f'https://www.zoopla.co.uk{address.find("a")["href"]}'
details["URL"] = scotland_house_url
scotland_house_url_response = requests.get(
scotland_house_url, headers=headers)
scotland_house_soup = bs(scotland_house_url_response.content, "lxml")
# Lists status of the property
try:
details["Status"] = [status.get_text(strip=True) for status in scotland_house_soup.find_all(
"span", class_="css-10o3xac-Tag e164ranr11")]
except AttributeError:
details["Status"] = ""
# Lists the date of the status of the property
try:
details["Status Date"] = [status_date.get_text(
strip=True) for status_date in scotland_house_soup.find_all("p", class_="css-1jq4rzj e164ranr10")]
except AttributeError:
details["Status Date"] = ""
# Lists the value of the property
try:
details["Value"] = [value.get_text(strip=True).replace(",", "").replace(
"£", "") for value in scotland_house_soup.find_all("p", class_="css-1x01gac-Text eczcs4p0")]
except AttributeError:
details["Value"] = ""
# Lists the number of rooms
try:
details["Number of Rooms"] = [number_of_rooms.get_text(strip=True) for number_of_rooms in scotland_house_soup.find_all(
"p", class_="css-82kmy1 e13gx5i3")]
except AttributeError:
details["Number of Rooms"] = ""
# Lists type of room
try:
details["Room"] = [room.get_text(strip=True) for room in scotland_house_soup.find_all(
"span", class_="css-1avcdf2 e13gx5i4")]
except AttributeError:
details["Room"] = ""
addresses.append(details)
page = page + 1
for address in addresses[:]:
print(address)
print(response)
Selecting by class_="css-1avcdf2 e13gx5i4" seems brittle, the class might change all the time. Try different CSS selector:
import requests
from bs4 import BeautifulSoup
url = "https://www.zoopla.co.uk/property/uprn/906032139/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
tag = soup.select_one('#timeline p:has(svg[data-testid="bed"]) + p')
no_beds, beds = tag.get_text(strip=True, separator=" ").split()
print(no_beds, beds)
Prints:
1 bed
If you want all types of rooms:
for detail in soup.select("#timeline p:has(svg[data-testid]) + p"):
n, type_ = detail.get_text(strip=True, separator="|").split("|")
print(n, type_)
Prints:
1 bed
1 bath
1 reception

Web-Scraping using BeautifulSoup (missing values when scraping)

I have been trying to webscrape a realtor website using BeautifulSoup and encountered 2 difficulties that I cannot seem to fix.
Difficulties:
When I run my code below, I am missing some date values. The dataframe should hold 68 rows of data scraped from the first page. The description and title scrapes return 68 rows, but the date scrape returns 66. I don't get N/A values returned if its missing either. Does anyone have an idea why this is? When I inspected the website elements it had the same tags, except it is listed as VIP or Special (promotion) apartments.
Secondly, I cannot seem to figure out how to scrape meta itemprop tags. I keep getting blank values when I use:
for tag in soup.findAll('div',attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
for tag2 in tag.findAll('div', attrs={'class':'announcement-block__date'}):
Thank you in advance for any assistance you could provide.
Python Code:
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd
def get_headers():
#Headers
headers={'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-language':'en-US,en;q=0.9',
'cache-control':'max-age=0',
'upgrade-insecure-requests':'1',
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
return headers
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
count = 1 # for pagination
#Make list holder
title = []
description = []
date = []
urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']
for x in urls:
count=1
y=x
while(count < 2): # will get only 1st page
print(x)
req = Request(x, headers=get_headers()) #req all headers
htmlfile = urlopen(req)
htmltext = htmlfile.read()
soup = bsoup(htmltext,'html.parser')
for tag in soup.findAll('div',attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
for tag2 in tag.findAll('a', attrs={'class':'announcement-block__title'}):
text = tag2.get_text().strip()
if len(text) > 0:
title.append(text)
else:
title.append('N/A')
for tag in soup.findAll('div',attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
for tag2 in tag.findAll('div', attrs={'class':'announcement-block__description'}):
text = tag2.get_text().strip()
if len(text) > 0:
description.append(text)
else:
description.append('N/A')
for tag in soup.findAll('div',attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
for tag2 in tag.findAll('div', attrs={'class':'announcement-block__date'}):
text = tag2.get_text().strip()
if len(text) > 0:
date.append(text)
else:
date.append('N/A')
# Go to next page
count=count+1
page = '?page='+str(count)
x=y+page
data_frame = pd.DataFrame(list(zip(title,description,date)),columns=['Title', 'Description', 'Date'])
You get 66 items because your date[] contains only 66 elements, therefore, you need to check all three fields at once in one for loop. Your if else checks do nothing as there are no announcement-block__date divs with empty content on the page.
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd
def get_headers():
#Headers
headers={'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-language':'en-US,en;q=0.9',
'cache-control':'max-age=0',
'upgrade-insecure-requests':'1',
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
return headers
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
count = 1 # for pagination
#Make list holder
info = {
'title': [],
'description': [],
'date': []
}
urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']
for x in urls:
count=1
y=x
while(count < 2): # will get only 1st page
print(x)
req = Request(x, headers=get_headers()) #req all headers
htmlfile = urlopen(req)
htmltext = htmlfile.read()
soup = bsoup(htmltext,'html.parser')
for tag in soup.findAll('div',attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
title = tag.find('a', attrs={'class':'announcement-block__title'})
description = tag.find('div', attrs={'class':'announcement-block__description'})
date = tag.find('div', attrs={'class':'announcement-block__date'})
info['title'].append(title.get_text().strip() if title else 'N/A')
info['description'].append(description.get_text().strip() if description else 'N/A')
info['date'].append(date.get_text().strip() if date else 'N/A')
# Go to next page
count=count+1
page = '?page='+str(count)
x=y+page
data_frame = pd.DataFrame(list(zip(info['title'], info['description'], info['date'])),columns=['Title', 'Description', 'Date'])
print(len(info['title']), len(info['description']), len(info['date']))
print(data_frame)
About your second question, a similar question has already been answered here

Creating a dataframe from a dictionary is giving me a could not broadcast error

I am trying to create a data frame from a dictionary I have and it gives me an error that says:
> ValueError: could not broadcast input array from shape (3) into shape
> (1)
Here is the code:
import requests
from bs4 import BeautifulSoup
from requests.api import request
from selenium import webdriver
from bs4 import Tag, NavigableString
baseurl = "https://www.olx.com.eg/"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}
product_links = []
for x in range(1,13):
r = requests.get(f"https://www.olx.com.eg/jobs/?page={x}", headers=headers)
soup = BeautifulSoup(r.content, "lxml")
product_list = soup.findAll("div", class_="ads__item")
for item in product_list:
for link in item.findAll("a",href=True):
product_links.append(link['href'])
for thing in product_links:
if '#' in product_links: product_links.remove('#')
# test_link = 'https://www.olx.com.eg/ad/-IDcjqyP.html'
for link in product_links:
r = requests.get(link, headers=headers)
soup = BeautifulSoup(r.content, "lxml")
job_title = soup.find('h1',class_="brkword")
job_location = soup.find('strong',class_="c2b")
job_date = soup.find('span',class_="pdingleft10 brlefte5")
try:
seniority = soup.find_all('td',class_='value')[0].text.strip()
except:
print("")
try:
full_or_part = soup.find_all('td',class_='value')[1].text.strip()
except:
print("")
try:
education_level = soup.find_all('td',class_='value')[2].text.strip()
except:
print("")
try:
sector = soup.find_all('td',class_='value')[3].text.strip()
except:
print("")
description = soup.find_all('p',class_='pding10')
df = {
"Job Title" : job_title,
"Job Location" : job_location,
"Post Date" : job_date,
"Seniority Level" : seniority,
"Full or Part time" : full_or_part,
"Educational Level" : education_level,
"Sector" : sector,
"Job Description" : description
}
job_data = pd.DataFrame(df)
Please tell me how I can transform the data I have into a data frame so I can export it into a csv
first of all I was trying to to scrape this jobs website and it scraped it successfully returning 500 jobs in the dictionary but I was unfortunately not able to transform the code into a dataframe, so later on i can export that out to a csv file, so i can do some analysis on it
To create dataframe from the job ads, you can try next example (some column names needs to be renamed from arabic to english though):
import requests
import pandas as pd
from bs4 import BeautifulSoup
baseurl = "https://www.olx.com.eg/"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}
product_links = []
for x in range(1, 2): # <-- increase the range here
r = requests.get(f"https://www.olx.com.eg/jobs/?page={x}", headers=headers)
soup = BeautifulSoup(r.content, "lxml")
product_list = soup.findAll("div", class_="ads__item")
for item in product_list:
for link in item.findAll("a", href=True):
if link["href"] != "#":
product_links.append(link["href"])
all_data = []
for link in product_links:
print(f"Getting {link} ...")
soup = BeautifulSoup(requests.get(link, headers=headers).content, "lxml")
d = {}
job_title = soup.find("h1").get_text(strip=True)
job_location = soup.find("strong", class_="c2b")
job_date = soup.find("span", class_="pdingleft10 brlefte5")
d["title"] = job_title
d["location"] = job_location.get_text(strip=True) if job_location else "N/A"
d["date"] = job_date.get_text(strip=True) if job_date else "N/A"
for table in soup.select("table.item"):
d[table.th.get_text(strip=True)] = table.td.get_text(strip=True)
all_data.append(d)
job_data = pd.DataFrame(all_data)
print(job_data)
job_data.to_csv("data.csv", index=False)
Creates data.csv (screenshot from LibreOffice):

How do I web scrape Worldometer charts?

Our organization is using Worldometers for COVID-19 data. I'm able to scrape the page state data, but our leaders want the 7-day moving average for new cases and deaths. To do this manually, you have to click on the 7-day moving average button and hover over today's date. Is there an automated method or module that is available to the public?
Link I can web scrape: https://www.worldometers.info/coronavirus/country/us/
Data I need in the images below.
You can use regex to pull that out:
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.worldometers.info/coronavirus/country/us/'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')
for script in scripts:
if "Highcharts.chart('graph-cases-daily'" in str(script):
jsonStr = str(script)
data = re.search(r"(name: '7-day moving average')[\s\S\W\w]*(data:[\s\S\W\w]*\d\])", jsonStr, re.IGNORECASE)
data = data.group(2).split('data:')[-1].strip().replace('[','').replace(']','').split(',')
Output:
print(data[-1])
148755
Better yet, we can pull out the dates too and make a dataframe:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import ast
url = 'https://www.worldometers.info/coronavirus/country/us/'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')
for script in scripts:
if "Highcharts.chart('graph-cases-daily'" in str(script):
jsonStr = str(script)
dates = re.search(r'(xAxis: {[\s\S\W\w]*)(categories: )(\[[\w\W\s\W]*\"\])', jsonStr)
dates = dates.group(3).replace('[','').replace(']','')
dates = ast.literal_eval(dates)
dates = [ x for x in dates]
data = re.search(r"(name: '7-day moving average')[\s\S\W\w]*(data:[\s\S\W\w]*\d\])", jsonStr, re.IGNORECASE)
data = data.group(2).split('data:')[-1].strip().replace('[','').replace(']','').split(',')
df = pd.DataFrame({'Date':dates, '7 Day Moving Average':data})
And to plot:
import matplotlib.pyplot as plt
df.iloc[1:]['7 Day Moving Average'].astype(int).plot(x ='Date', y='7 Day Moving Average', kind = 'line')
plt.show()
UPDATE:
To get each state, we grabbed the href for each of them then pulled out the data. I went ahead and combined all the tables and you can just query the 'State' column for a specific state:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import ast
url = 'https://www.worldometers.info/coronavirus/country/us/'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
states_list = []
rows = soup.find('table', {'id':'usa_table_countries_today'}).find_all('tr')
for row in rows:
if row.find_all('td'):
tds = row.find_all('td')
for data in tds:
if data.find('a', {'class':'mt_a'}):
href = data.find('a', {'class':'mt_a'})['href']
states_list.append(href)
states_list = [x for x in states_list]
df_dict = {}
for state in states_list:
print(state)
df_dict[state] = []
state_url = 'https://www.worldometers.info/' + state
response = requests.get(state_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')
for script in scripts:
for graph_type in ['cases','deaths']:
if "Highcharts.chart('graph-%s-daily'" %graph_type in str(script):
jsonStr = str(script)
dates = re.search(r'(xAxis: {[\s\S\W\w]*)(categories: )(\[[\w\W\s\W]*\"\])', jsonStr)
dates = dates.group(3).replace('[','').replace(']','')
dates = ast.literal_eval(dates)
dates = [ x for x in dates]
data = re.search(r"(name: '7-day moving average')[\s\S\W\w]*(data:[\s\S\W\w]*\d\])", jsonStr, re.IGNORECASE)
data = data.group(2).split('data:')[-1].strip().replace('[','').replace(']','').split(',')
df = pd.DataFrame({'Date':dates, '7 Day Moving Average - %s' %graph_type.title():data})
df_dict[state].append(df)
# Combine the tables
df_list = []
for state, tables in df_dict.items():
dfs = [df.set_index('Date') for df in tables]
temp_df = pd.concat(dfs, axis=1).reset_index(drop=False)
temp_df['State'] = state.split('/')[-2]
df_list.append(temp_df)
results = pd.concat(df_list, axis=0)
I was able to just scrape the page using BeautifulSoup. I got the area I want - finding the 7-day average - but I'm having difficulties trying to organize the data into a data frame. Ultimately, I just want the latest date, but I'm unsure about how to get there.
import requests
from bs4 import BeautifulSoup
url = "https://www.worldometers.info/coronavirus/usa/california/#graph-cases-daily"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
all_scripts = soup.find_all('script')

Scrapy Crawler: Scrape lists from pages

Objective: Crawl this page
https://www.cardplayer.com/poker-tournaments/monthly/2021/06
And then get a list of all the tournaments on each page.
here is my code
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import pandas as pd
mydf = pd.DataFrame()
class TournamentsSpider(CrawlSpider):
name = 'tournaments'
allowed_domains = ['www.cardplayer.com']
start_urls = ['https://www.cardplayer.com/poker-tournaments/monthly/2021/06']
rules = (
Rule(LinkExtractor(restrict_xpaths='/html/body/div[5]/div/div[2]/div[2]/div[3]/div/table/tbody/tr/td[2]/a'),
callback='parse_item', follow=True),
)
def parse_item(self, response):
# I'm aware that some of the pages have two tables(I was thinking an if statement on the length of response and then running for table 1 on 1 table pages and table 2 on 2 table pages
for series in response.xpath('/html/body/div[5]/div/div[2]/div[3]/table/tbody'):
mydf["Event"] = series.xpath('/html/body/div[5]/div/div[2]/div[3]/table/tbodytr/td[1]/a/text()')
mydf["start"] = series.xpath('.//tr/td[2]/text()')
mydf["days"] = series.xpath('.//tr/td[3]/text()')
mydf["buyin"] = series.xpath('.//tr/td[4]/text()')
process = CrawlerProcess({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(TournamentsSpider)
process.start()
print(mydf)
I can see the crawler finds all the URLs but the output only comes back for just 1 page so I'm doing something wrong.
Here is how i attempted doing this using bs4, just need to enter the number of years your wanting to collect.
# Get Product Page Links
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
baseurl = 'https://www.cardplayer.com/poker-tournaments/monthly/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
Tournaments = []
def GetPageData(url):
#Get singular page info
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
# Get all tr elements with empty class
productlist = soup.find_all('tr', class_='')
for i, item in enumerate(productlist):
# Skip first row of table titles
if (i != 0):
# remove spaces
RawTournamentInfo = str(item.text).strip()
# splits into list by new lines
RawTournamentInfo = RawTournamentInfo.splitlines()
# Create empty in strings
Date = ''
Name = ''
Location = ''
# had to loop of list, forsome reason not allowing direct calling
for i, item in enumerate(RawTournamentInfo):
if i == 0: Date = item
if i == 1: Name = item
if i == 2: Location = item
# Creating object and saving to list
if (Date != "Dates") and (Date != 'No tournament series found.'):
print('Added: ', Name)
tournament = {
'date': Date,
'name': Name,
'location': Location
}
Tournaments.append(tournament)
r.close()
def GetTournaments(yearsToCollect):
#Get Current Year/Month
today = datetime.today()
currentMonth = today.month
currentYear = today.year
for year in range(yearsToCollect):
#Finish current Year
if (year == 0):
for i in range(12 - currentMonth):
GetPageData(baseurl + str(currentYear) + '/' + str(currentMonth + i))
#All other years
else:
for i in range(12):
GetPageData(baseurl + str(currentYear + year) + '/' + str(i))
# Save to .xlsx
Tournamentsdf = pd.DataFrame(Tournaments)
Tournamentsdf.to_excel('Tournaments.xlsx', index=False)
if __name__ == "__main__":
yearsToCollect = 2
GetTournaments(yearsToCollect)

Categories

Resources