Multiple requests to scrape different pages are giving the same result - python

I want to scrape the ICC cricket website and find the rankings of batsmen on a particular date. The problem is that the result I'm getting is the same for every date: the scrape returns the most recent rankings rather than the rankings for the requested date. The code is given below. Can someone tell me why this is happening, or suggest a solution?
I feel that the problem is that BeautifulSoup never sees the fully loaded page, which in turn gives incorrect information, since the data I need only appears after the filters are applied on the website.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import datetime
import os
date_list = pd.date_range(start = "1971-02-01", end=datetime.date.today(), freq='1d')
def get_batsmen(date):
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting?at={date}'
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")
    find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns = ['Player Name'])
    return df
def get_bowler(date):
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling?at={date}'
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")
    find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns = ['Player Name'])
    return df
def get_allrounder(date):
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/all-rounder?at={date}'
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")
    find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns = ['Player Name'])
    return df
#Storing the data into multiple csvs
for date in date_list:
    year = date.year
    month = date.month
    day = date.day
    newpath = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}'
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    newpath1 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}'
    if not os.path.exists(newpath1):
        os.makedirs(newpath1)
    newpath2 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}\{day}'
    if not os.path.exists(newpath2):
        os.makedirs(newpath2)
    get_batsmen(date).to_csv(newpath2+'/batsmen.csv')
    get_bowler(date).to_csv(newpath2+'/bowler.csv')
    get_allrounder(date).to_csv(newpath2+'/allrounder.csv')

If the website you're scraping is interactive, it can be worth looking at Selenium as the scraping package instead of bs4, so that the page's JavaScript is actually executed.
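A minimal sketch of that idea for the batting page, assuming Chrome and a matching chromedriver are installed; the CSS class names are copied from the question, and formatting the date as YYYY-MM-DD for the ?at= parameter is an assumption:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_batsmen_selenium(date):
    # Render the page in a headless browser so the date filter's JavaScript runs.
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        url = f"https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting?at={date:%Y-%m-%d}"
        driver.get(url)
        time.sleep(2)  # crude wait for the rankings table to finish rendering
        doc = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()
    player_list = []
    top = doc.find("div", class_="rankings-block__banner--name-large")
    if top:
        player_list.append(top.text.strip())
    for cell in doc.find_all("td", class_="table-body__cell rankings-table__name name"):
        link = cell.find("a")
        if link:
            player_list.append(link.text.strip())
    return pd.DataFrame(player_list, columns=["Player Name"])
Starting a fresh driver per call is slow over a 50-year date range, so reusing one driver across dates (and across the bowler and all-rounder functions) would be worth considering.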

Related

How to match data from linked pages together by web scraping

I'm trying to scrape some data from a website and I have an issue in matching the data from every subpage to the data of the main page.
For example: the main page has a country name "Alabama Trucking Companies", and when I follow its link I find some cities (Abbeville, Adamsville, ...etc). I need to pair every city's details (city name and city link) with its country name.
Country names that I scraped from the main page:
City names that I scraped from the sub page:
The code below extracts the data from the main and sub pages individually, without matching them to each other, so how can I solve this issue please?
The code that I've used:-
start_time = datetime.now()
url = 'https://www.quicktransportsolutions.com/carrier/usa-trucking-companies.php'
page_country = requests.get(url).content
soup_country = BeautifulSoup(page_country, 'lxml')
countries = soup_country.find('div',{'class':'col-xs-12 col-sm-9'})
countries_list = []
country_info = countries.find_all('div',{'class':'col-md-4 column'})
for i in country_info:
    title_country = i.text.strip()
    href_country = i.find('a', href=True)['href']
    countries_list.append({'Country Title':title_country, 'Link':(f'https://www.quicktransportsolutions.com//carrier//{href_country}')})
countries_links = []
for i in pd.DataFrame(countries_list)['Link']:
    page_city = requests.get(i).content
    soup_city = BeautifulSoup(page_city, 'lxml')
    city = soup_city.find('div',{'align':'center','class':'table-responsive'})
    countries_links.append(city)
cities_list = []
for i in countries_links:
    city_info = i.find_all('td',"")
    for i in city_info:
        title_city = i.text.strip()
        try:
            href_city = i.find('a', href=True)['href']
        except:
            continue
        cities_list.append({'City Title':title_city,'City Link':href_city})
end_time = datetime.now()
print(f'Duration: {end_time - start_time}')
df = pd.DataFrame(cities_list)
df = df.loc[df['City Link']!= '#'].drop_duplicates().reset_index(drop=True)
df
The expected data to see for every country is shown below:
Instead of parsing all of the state links and adding them to a list prior to crawling each of the city pages, you can parse each state, extract its link, then immediately follow that link to get all of the cities for that state before moving on to the next state, appending all of the information to one master list as you go.
For example:
start_time = datetime.now()
url = 'https://www.quicktransportsolutions.com/carrier/usa-trucking-companies.php'
page_country = requests.get(url).content
soup_country = BeautifulSoup(page_country, 'lxml')
countries = soup_country.find('div',{'class':'col-xs-12 col-sm-9'})
data_list = []
country_info = countries.find_all('div',{'class':'col-md-4 column'})
for i in country_info:
    title_country = i.text.strip()
    href_country = i.find('a', href=True)['href']
    link = f'https://www.quicktransportsolutions.com/carrier/{href_country}'
    page_city = requests.get(link).content
    soup_city = BeautifulSoup(page_city, 'lxml')
    city = soup_city.find('div',{'align':'center','class':'table-responsive'})
    city_info = city.find_all('td',"")
    for i in city_info:
        title_city = i.text.strip()
        try:
            href_city = i.find('a', href=True)['href']
        except:
            continue
        row = {
            'Country Title':title_country,
            'Link':link,
            'City Title':title_city,
            'City Link':href_city
        }
        data_list.append(row)
end_time = datetime.now()
print(f'Duration: {end_time - start_time}')
df = pd.DataFrame(data_list)
df = df.loc[df['City Link']!= '#'].drop_duplicates().reset_index(drop=True)

Scraping a website with multiple pages and not getting the desired amount of output with BeautifulSoup

I am trying to scrape a website with multiple pages (50) and get specific information, but when I run my code my output is just 7 items when there are over 20,000 on the website, and I found out that my code is scraping just the first page. I don't know what else to do; I'd appreciate your help. Thank you.
import requests
from bs4 import BeautifulSoup
import pandas as pd
name_selector = ".name"
old_price_selector = ".old"
new_price_selector = ".prc"
for i in range(1,50,1):
    url = "https://www.jumia.com.ng/phones-tablets/samsung/?q=samsung+phones&page=" +str(i)+ "#catalog-listing"
    website = requests.get(url)
    soup = BeautifulSoup(website.content, 'html.parser')
    name = soup.select(name_selector)
    old_price = soup.select(old_price_selector)
    new_price = soup.select(new_price_selector)
    discount = soup.findAll("div", {"class": "bdg _dsct _sm"})
    data = []
    for names, old_prices, new_prices, discounts in zip(name, old_price, new_price, discount):
        dic = {"Phone Names": names.getText(),"New Prices": old_prices.getText(),"Old Prices": new_prices.getText(),"Discounts": discounts.getText()}
        data.append(dic)
df = pd.DataFrame(data)
You have to create data = [] before the first loop. That's all.
data = []

for i in range(1, 50):
    # ... code ...
Your code creates a new data = [] in every loop iteration, and that removes the previous content - so you only get data from the last page.
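For reference, a corrected sketch of the full loop with that change applied; the selectors are copied from the question, and the Old/New price labels, which appear swapped in the question, are mapped directly here:
import requests
from bs4 import BeautifulSoup
import pandas as pd

name_selector = ".name"
old_price_selector = ".old"
new_price_selector = ".prc"

data = []  # created once, before the page loop, so results accumulate across pages
for i in range(1, 50):
    url = f"https://www.jumia.com.ng/phones-tablets/samsung/?q=samsung+phones&page={i}#catalog-listing"
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    rows = zip(
        soup.select(name_selector),
        soup.select(old_price_selector),
        soup.select(new_price_selector),
        soup.find_all("div", {"class": "bdg _dsct _sm"}),
    )
    for name, old_price, new_price, discount in rows:
        data.append({
            "Phone Names": name.getText(),
            "Old Prices": old_price.getText(),
            "New Prices": new_price.getText(),
            "Discounts": discount.getText(),
        })

df = pd.DataFrame(data)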

Trying to export data taken from HTML using beautiful soup

I'm trying to extract information from the HTML text that I get from URLs built in a for loop, and then parse it with Beautiful Soup.
I manage to isolate the information correctly, but when I try to export the data I get the error message "All arrays must be of the same length".
weblink = []
filing_type = []
company_name = []
date = []
name = []
#Importing file
df = pd.read_csv('Downloads\Dropped_Companies.csv')
#Getting companies' names into a list
companies_column=list(df.columns.values)[4]
name_ = df[companies_column].tolist()
#Formatting company names for creating URLs
for CompanyName in name_:
    company_name.append(CompanyName.lower().replace(" ",'_'))
company_name
for item in range(0, len(company_name)):
    link = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&company=' + company_name[item] + '&type=10-K&dateb=&owner=exclude&count=100'
    #Getting the HTML text
    headers = random.choice(headers_list)
    r = requests.Session()
    r.headers = headers
    html = r.get(link).text
    #Calling beautiful soup for better HTML text
    soup = bs.BeautifulSoup(html)
    tet_ = soup.find_all("a", id = "documentsbutton")
    #Get the links
    for link in tet_:
        weblink.append('https://www.sec.gov' + link.get('href'))
    test11 = soup.find_all("table", class_= "tableFile2")
    for link in test11:
        row = link.find_all("td", nowrap = "nowrap")
        for i in range(0, len(row), 3):
            filing_type.append(row[i].getText())
            date.append(link.find("td", class_ = "small").find_next_sibling("td").text)
            name.append(company_name[item])
data = {'Company Name':name,'Filing Date': date,'Filing Type':filing_type,"Weblink":weblink}
outputdf = pd.DataFrame(data)
outputdf.to_csv('Downloads/t_10KLinks.csv')
data = {'Company Name':name,'Filing Date': date,'Filing Type':filing_type,"Weblink":weblink}
outputdf = pd.DataFrame.from_dict(data, orient='index')
outputdf.to_csv('Downloads/t_10KLinks.csv')
See the pandas.DataFrame.from_dict documentation.
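A small self-contained illustration of why orient='index' avoids the length error (the lists here are made up, not the question's data):
import pandas as pd

# Lists of different lengths, standing in for the scraper's output
data = {
    'Company Name': ['Acme', 'Beta', 'Gamma'],
    'Filing Date': ['2020-01-01'],
    'Filing Type': ['10-K', '10-K'],
}

# pd.DataFrame(data) raises "All arrays must be of the same length";
# from_dict with orient='index' builds one row per key instead and
# pads the shorter rows with NaN.
df = pd.DataFrame.from_dict(data, orient='index')
print(df)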

How to get the first column values in a Wikipedia table using python bs4?

I'm trying to web scrape a data table in Wikipedia using python bs4, but I'm stuck with this problem: when getting the data values, my code is not picking up the first column (index zero). I feel there is something wrong with the indexing but I can't figure it out. Please help; see my code below.
response_obj = requests.get('https://en.wikipedia.org/wiki/Metro_Manila').text
soup = BeautifulSoup(response_obj,'lxml')
Neighborhoods_MM_Table = soup.find('table', {'class':'wikitable sortable'})
rows = Neighborhoods_MM_Table.select("tbody > tr")[3:8]
cities = []
for row in rows:
    city = {}
    tds = row.select('td')
    city["City or Municipal"] = tds[0].text.strip()
    city["%_Population"] = tds[1].text.strip()
    city["Population"] = float(tds[2].text.strip().replace(",",""))
    city["area_sqkm"] = float(tds[3].text.strip().replace(",",""))
    city["area_sqm"] = float(tds[4].text.strip().replace(",",""))
    city["density_sqm"] = float(tds[5].text.strip().replace(",",""))
    city["density_sqkm"] = float(tds[6].text.strip().replace(",",""))
    cities.append(city)
print(cities)
df=pd.DataFrame(cities)
df.head()
import requests
from bs4 import BeautifulSoup
import pandas as pd
def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    target = [item.get_text(strip=True) for item in soup.findAll(
        "td", style="text-align:right") if "%" in item.text] + [""]
    df = pd.read_html(r.content, header=0)[5]
    df = df.iloc[1: -1]
    df['Population (2015)[3]'] = target
    print(df)
    df.to_csv("data.csv", index=False)

main("https://en.wikipedia.org/wiki/Metro_Manila")

Google Play Crawler results - save to CSV

I made a simple crawler that reads a CSV file of Google Play packages like com.viber.voip and goes to the full link like https://play.google.com/store/apps/details?id=com.viber.voip&hl=en.
It then crawls the title, publisher, downloads etc. and stores them in a list.
The problem is, when I'm trying to save the results into a CSV file, it throws an error if I'm exporting using pandas to_csv, or throws a UnicodeError when it finds some unknown characters. I tried to add .encode or .decode but it doesn't help. Can someone assist please?
import bs4 as bs
import urllib.request
import pandas as pd
import csv
def searcher(bundles):
    html = urllib.request.urlopen(base_url+bundles+post_url).read()
    soup = bs.BeautifulSoup(html, 'html.parser')
    title_app = soup.title.get_text()
    publisher_name = soup.find('a', {'class':'document-subtitle primary'}).get_text()
    category = soup.find('a', {'class':'document-subtitle category'}).get_text()
    ratings = soup.find('meta', {'itemprop':'ratingValue'}).get('content')
    reviews = soup.find('span', {'class':'reviews-num'}).get_text()
    downloads = soup.find('div', {'itemprop':'numDownloads'}).get_text()
    updated_last_time = soup.find('div', {'class':'content'}).get_text()
    text = (bundles, title_app, publisher_name, category, ratings, reviews, downloads, updated_last_time)
    return (text)
def store(crawled_data):
    writer = csv.writer(f)
    labels = ['bundles', 'title_app', 'publisher_name', 'category', 'ratings', 'reviews', 'downloads', 'updated_last_time']
    writer.writerow(labels)
    df = pd.DataFrame(crawled_data)
    for row in df:
        if row != None:
            writer.writerow(row)
base_url = 'https://play.google.com/store/apps/details?id='
post_url = '&hl=en'
crawled_data = []
crawled_packages = 0
with open('links.csv', 'r') as f:
    df = pd.read_csv(f)
    urls = df['URLs']
    for bundles in urls:
        if bundles != None:
            aaa = searcher(bundles)
            print(crawled_packages)
            crawled_packages += 1
            if crawled_data != None:
                crawled_data.append(aaa)
    store(crawled_data)
You can use to_csv() by specifying an output file to use. Also specify your column names whilst building the dataframe:
import bs4 as bs
import urllib.request
import pandas as pd
import csv
def searcher(bundles):
    html = urllib.request.urlopen(base_url+bundles+post_url).read()
    soup = bs.BeautifulSoup(html, 'html.parser')
    title_app = soup.title.get_text()
    publisher_name = soup.find('a', {'class':'document-subtitle primary'}).get_text(strip=True)
    category = soup.find('a', {'class':'document-subtitle category'}).get_text(strip=True)
    ratings = soup.find('meta', {'itemprop':'ratingValue'}).get('content')
    reviews = soup.find('span', {'class':'reviews-num'}).get_text(strip=True)
    downloads = soup.find('div', {'itemprop':'numDownloads'}).get_text(strip=True)
    updated_last_time = soup.find('div', {'class':'content'}).get_text(strip=True)
    return (bundles, title_app, publisher_name, category, ratings, reviews, downloads, updated_last_time)
def store(crawled_data):
    labels = ['bundles', 'title_app', 'publisher_name', 'category', 'ratings', 'reviews', 'downloads', 'updated_last_time']
    df = pd.DataFrame(crawled_data, columns=labels)
    df.to_csv('output.csv', index=False)
base_url = 'https://play.google.com/store/apps/details?id='
post_url = '&hl=en'
crawled_data = []
crawled_packages = 0
with open('links.csv', 'r') as f:
    df = pd.read_csv(f)
    urls = df['URLs']
    for bundles in urls:
        if bundles != None:
            aaa = searcher(bundles)
            print(crawled_packages)
            crawled_packages += 1
            if crawled_data != None:
                crawled_data.append(aaa)
    store(crawled_data)
This will give you an output.csv file containing:
bundles,title_app,publisher_name,category,ratings,reviews,downloads,updated_last_time
com.viber.voip,Viber Messenger - Android Apps on Google Play,Viber Media S.à r.l.,Communication,4.336112022399902,"11,016,404","500,000,000 - 1,000,000,000","March 15, 2018"
