I made a simple crawler that reads Google Play package names like com.viber.voip from a CSV file and builds the full link, e.g. https://play.google.com/store/apps/details?id=com.viber.voip&hl=en.
It then crawls the title, publisher, downloads, etc. and stores them in a list.
The problem is that when I try to save the results into a CSV file, it throws an error if I export using pandas to_csv, or a UnicodeError when it finds some unknown characters. I tried adding .encode or .decode but it doesn't help. Can someone assist, please?
import bs4 as bs
import urllib.request
import pandas as pd
import csv

def searcher(bundles):
    html = urllib.request.urlopen(base_url + bundles + post_url).read()
    soup = bs.BeautifulSoup(html, 'html.parser')
    title_app = soup.title.get_text()
    publisher_name = soup.find('a', {'class': 'document-subtitle primary'}).get_text()
    category = soup.find('a', {'class': 'document-subtitle category'}).get_text()
    ratings = soup.find('meta', {'itemprop': 'ratingValue'}).get('content')
    reviews = soup.find('span', {'class': 'reviews-num'}).get_text()
    downloads = soup.find('div', {'itemprop': 'numDownloads'}).get_text()
    updated_last_time = soup.find('div', {'class': 'content'}).get_text()
    text = (bundles, title_app, publisher_name, category, ratings, reviews, downloads, updated_last_time)
    return (text)

def store(crawled_data):
    writer = csv.writer(f)
    labels = ['bundles', 'title_app', 'publisher_name', 'category', 'ratings', 'reviews', 'downloads', 'updated_last_time']
    writer.writerow(labels)
    df = pd.DataFrame(crawled_data)
    for row in df:
        if row != None:
            writer.writerow(row)

base_url = 'https://play.google.com/store/apps/details?id='
post_url = '&hl=en'
crawled_data = []
crawled_packages = 0

with open('links.csv', 'r') as f:
    df = pd.read_csv(f)
    urls = df['URLs']
    for bundles in urls:
        if bundles != None:
            aaa = searcher(bundles)
            print(crawled_packages)
            crawled_packages += 1
            if crawled_data != None:
                crawled_data.append(aaa)

store(crawled_data)
You can use to_csv() by specifying an output file. Also specify your column names while building the DataFrame:
import bs4 as bs
import urllib.request
import pandas as pd
import csv

def searcher(bundles):
    html = urllib.request.urlopen(base_url + bundles + post_url).read()
    soup = bs.BeautifulSoup(html, 'html.parser')
    title_app = soup.title.get_text()
    publisher_name = soup.find('a', {'class': 'document-subtitle primary'}).get_text(strip=True)
    category = soup.find('a', {'class': 'document-subtitle category'}).get_text(strip=True)
    ratings = soup.find('meta', {'itemprop': 'ratingValue'}).get('content')
    reviews = soup.find('span', {'class': 'reviews-num'}).get_text(strip=True)
    downloads = soup.find('div', {'itemprop': 'numDownloads'}).get_text(strip=True)
    updated_last_time = soup.find('div', {'class': 'content'}).get_text(strip=True)
    return (bundles, title_app, publisher_name, category, ratings, reviews, downloads, updated_last_time)

def store(crawled_data):
    labels = ['bundles', 'title_app', 'publisher_name', 'category', 'ratings', 'reviews', 'downloads', 'updated_last_time']
    df = pd.DataFrame(crawled_data, columns=labels)
    df.to_csv('output.csv', index=False)

base_url = 'https://play.google.com/store/apps/details?id='
post_url = '&hl=en'
crawled_data = []
crawled_packages = 0

with open('links.csv', 'r') as f:
    df = pd.read_csv(f)
    urls = df['URLs']
    for bundles in urls:
        if bundles != None:
            aaa = searcher(bundles)
            print(crawled_packages)
            crawled_packages += 1
            if crawled_data != None:
                crawled_data.append(aaa)

store(crawled_data)
This will give you an output.csv file containing:
bundles,title_app,publisher_name,category,ratings,reviews,downloads,updated_last_time
com.viber.voip,Viber Messenger - Android Apps on Google Play,Viber Media S.à r.l.,Communication,4.336112022399902,"11,016,404","500,000,000 - 1,000,000,000","March 15, 2018"
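If a UnicodeEncodeError still shows up when writing, for example on a system whose default encoding is not UTF-8, passing an explicit encoding to to_csv is usually enough. A minimal sketch of that option, using the same df as above:

# force UTF-8 regardless of the platform's default encoding
df.to_csv('output.csv', index=False, encoding='utf-8')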
Related
I want to scrape the ICC cricket website and find the rankings of batsmen on a particular date. The problem is that the result I'm getting is the same for all dates: the scraping gives me the most recent rankings rather than the rankings on the particular date. The code is given below. Can someone tell me why this is happening, or a solution for it?
I suspect the problem is that BeautifulSoup is not letting the page load completely, which in turn gives wrong information, since the data I need is only shown after the filters are applied on the website.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import datetime
import os

date_list = pd.date_range(start="1971-02-01", end=datetime.date.today(), freq='1d')

def get_batsmen(date):
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting?at={date}'
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")
    find_class = doc.find_all("td", class_='table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns=['Player Name'])
    return df

def get_bowler(date):
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling?at={date}'
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")
    find_class = doc.find_all("td", class_='table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns=['Player Name'])
    return df

def get_allrounder(date):
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/all-rounder?at={date}'
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")
    find_class = doc.find_all("td", class_='table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns=['Player Name'])
    return df

# Storing the data into multiple csvs
for date in date_list:
    year = date.year
    month = date.month
    day = date.day
    newpath = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}'
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    newpath1 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}'
    if not os.path.exists(newpath1):
        os.makedirs(newpath1)
    newpath2 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}\{day}'
    if not os.path.exists(newpath2):
        os.makedirs(newpath2)
    get_batsmen(date).to_csv(newpath2 + '/batsmen.csv')
    get_bowler(date).to_csv(newpath2 + '/bowler.csv')
    get_allrounder(date).to_csv(newpath2 + '/allrounder.csv')
If the website you're scraping is interactive, it can be good to look at Selenium as a scraping package instead of bs4, so that JavaScript is executed.
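A minimal sketch of that idea for one of the ranking pages, assuming Chrome and the selenium package are installed; the ?at= date value and the 5-second wait are placeholders, and the CSS classes are taken from the question:

from bs4 import BeautifulSoup
from selenium import webdriver
import time

url = 'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting?at=2015-06-01'

driver = webdriver.Chrome()   # a real browser, so the page's JavaScript actually runs
driver.get(url)
time.sleep(5)                 # crude wait for the date filter to be applied client-side
doc = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

# the parsing can stay the same as in the question
cells = doc.find_all('td', class_='table-body__cell rankings-table__name name')
print([cell.find('a').text for cell in cells][:5])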
I am new to Python. I am using it in a Jupyter notebook to scrape a table from Wikipedia. All the code I wrote works, except when I want to put the information into a CSV file. The error that appears is "IndexError: list index out of range".
Here is the code:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'

import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

s = requests.Session()
response = s.get(url, timeout=10)
response

table_id = 'main'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.prettify().encode('UTF-8'))

table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr'):
    print(row)

table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    print(col[0].find('a').contents[0])
    print(col[1].string)  # name
    print(col[2].string)
    print(col[3].string)
    print(col[4].string)
    print(col[5].find(text=True))

csvfile = open('population.csv', 'w')
csvwriter = csv.writer(csvfile, delimiter=',')
headers = ('COUNTRY', 'CONTINENT', 'SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
csvwriter.writerow(headers)

table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    country = col[0].find('a').contents[0]
    continent = col[1].string
    subregion = col[2].string
    population_2018 = col[3].string
    population_2019 = col[4].string
    change = col[5].find(text=True)
    parsed_row = (country, continent, subregion, population_2018, population_2019, change)
    csvwriter.writerow(parsed_row)

csvfile.close()
Thank you very much!
My answer has two parts: the easiest way to accomplish your task, and where the error in your code is.
Let pandas handle requests, BeautifulSoup, and csv for you:
import pandas as pd
URI = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
df = pd.read_html(URI)[3]
df.to_csv('population.csv', index=False)
pandas has .read_html, which returns a list of all tables on the webpage. Your table was at index 3. With that, I saved it with .to_csv.
With .read_html you can also pass the attributes of a specific table, e.g. attrs={'id': 'table'}:
# the table is now at index 0
df = pd.read_html(URI, attrs={'id':'main'})[0]
You can also specify the parser that will be used by BeautifulSoup, which .read_html calls:
df = pd.read_html(URI, attrs={'id':'main'}, flavor='lxml')[0]
# 'lxml' is known for speed. But you can use `html.parser` if `lxml` or `html5lib` are not installed.
See the pandas documentation for .read_html for more options.
Update: Debugging Your Code
The error in your code comes from rows where col is empty. Adding an if condition solves the problem:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'

import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

s = requests.Session()
response = s.get(url, timeout=10)
response

table_id = 'main'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# print(soup.prettify().encode('UTF-8'))

csvfile = open('population.csv', 'w')
csvwriter = csv.writer(csvfile, delimiter=',')
headers = ('COUNTRY', 'CONTINENT', 'SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
csvwriter.writerow(headers)

table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    # this is all that was missing
    if col:
        country = col[0].find('a')['title']
        continent = col[1].string
        subregion = col[2].string
        population_2018 = col[3].string
        population_2019 = col[4].string
        change = col[5].find(text=True)
        parsed_row = (country, continent, subregion, population_2018, population_2019, change)
        csvwriter.writerow(parsed_row)

csvfile.close()
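Two optional tweaks when using the csv module directly, not part of the original answer: opening the file with newline='' avoids blank lines between rows on Windows, and an explicit encoding sidesteps locale-dependent UnicodeEncodeError:

# open the output file with newline='' (recommended by the csv docs) and UTF-8
csvfile = open('population.csv', 'w', newline='', encoding='utf-8')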
Prayson W. Daniel has already given the answer, and I offer another way.
import requests
from simplified_scrapy import SimplifiedDoc, utils, req

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
s = requests.Session()
res = s.get(url, timeout=10)

rows = []
headers = ('COUNTRY', 'CONTINENT', 'SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
rows.append(headers)

table_id = 'main'
doc = SimplifiedDoc(res.text)
table = doc.select('table#' + table_id)   # Get the table by id
trs = table.tbody.children.children[1:]   # Get all data rows
for tr in trs:
    row = [tr[0].a.text]     # First col, get first link
    row.extend(tr.text[1:])  # Remaining cols
    rows.append(row)

utils.save2csv('test_wiki.csv', rows)  # Save data to csv
I was able to loop the web scraping process, but the data collected from each page replaces the data from the page before, so the resulting file contains only the data from the last page. What do I need to do?
from bs4 import BeautifulSoup
import requests
import pandas as pd

print('all imported successfuly')

for x in range(1, 44):
    link = (f'https://www.trustpilot.com/review/birchbox.com?page={x}')
    print(link)
    req = requests.get(link)
    content = req.content
    soup = BeautifulSoup(content, "lxml")
    names = soup.find_all('div', attrs={'class': 'consumer-information__name'})
    headers = soup.find_all('h2', attrs={'class': 'review-content__title'})
    bodies = soup.find_all('p', attrs={'class': 'review-content__text'})
    ratings = soup.find_all('div', attrs={'class': 'star-rating star-rating--medium'})
    dates = soup.find_all('div', attrs={'class': 'review-content-header__dates'})
    print('pass1')

df = pd.DataFrame({'User Name': names, 'Header': headers, 'Body': bodies, 'Rating': ratings, 'Date': dates})
df.to_csv('birchbox006.csv', index=False, encoding='utf-8')
print('excel done')
Because you are using a loop, the variables are constantly being overwritten. Normally what you'd do in a situation like this is have an array and then append to it throughout the loop:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json

print('all imported successfuly')

# Initialize an empty dataframe
df = pd.DataFrame()

for x in range(1, 44):
    names = []
    headers = []
    bodies = []
    ratings = []
    published = []
    updated = []
    reported = []

    link = (f'https://www.trustpilot.com/review/birchbox.com?page={x}')
    print(link)
    req = requests.get(link)
    content = req.content
    soup = BeautifulSoup(content, "lxml")
    articles = soup.find_all('article', {'class': 'review'})
    for article in articles:
        names.append(article.find('div', attrs={'class': 'consumer-information__name'}).text.strip())
        headers.append(article.find('h2', attrs={'class': 'review-content__title'}).text.strip())
        try:
            bodies.append(article.find('p', attrs={'class': 'review-content__text'}).text.strip())
        except:
            bodies.append('')
        try:
            # the rating lives in its own div, not in the review text
            ratings.append(article.find('div', attrs={'class': 'star-rating star-rating--medium'}).text.strip())
        except:
            ratings.append('')
        dateElements = article.find('div', attrs={'class': 'review-content-header__dates'}).text.strip()
        jsonData = json.loads(dateElements)
        published.append(jsonData['publishedDate'])
        updated.append(jsonData['updatedDate'])
        reported.append(jsonData['reportedDate'])

    # Create your temporary dataframe of the first iteration, then append that into your "final" dataframe
    temp_df = pd.DataFrame({'User Name': names, 'Header': headers, 'Body': bodies, 'Rating': ratings, 'Published Date': published, 'Updated Date': updated, 'Reported Date': reported})
    df = df.append(temp_df, sort=False).reset_index(drop=True)

    print('pass1')

df.to_csv('birchbox006.csv', index=False, encoding='utf-8')
print('excel done')
The reason is that you are overwriting your variables in each iteration.
If you want to extend these variables instead, you can do, for example:
names = []
headers = []
bodies = []
ratings = []
dates = []

for x in range(1, 44):
    link = (f'https://www.trustpilot.com/review/birchbox.com?page={x}')
    print(link)
    req = requests.get(link)
    content = req.content
    soup = BeautifulSoup(content, "lxml")
    names += soup.find_all('div', attrs={'class': 'consumer-information__name'})
    headers += soup.find_all('h2', attrs={'class': 'review-content__title'})
    bodies += soup.find_all('p', attrs={'class': 'review-content__text'})
    ratings += soup.find_all('div', attrs={'class': 'star-rating star-rating--medium'})
    dates += soup.find_all('div', attrs={'class': 'review-content-header__dates'})
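Note that these lists then hold BeautifulSoup tags rather than plain strings, so a small follow-up step (not shown in the original answer) is needed to pull out the text before building the DataFrame. A sketch, assuming the same variable names and that every review has all five fields (otherwise the lists end up with different lengths):

# extract the visible text from each tag before creating the dataframe
df = pd.DataFrame({
    'User Name': [n.text.strip() for n in names],
    'Header': [h.text.strip() for h in headers],
    'Body': [b.text.strip() for b in bodies],
    'Rating': [r.text.strip() for r in ratings],
    'Date': [d.text.strip() for d in dates],
})
df.to_csv('birchbox006.csv', index=False, encoding='utf-8')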
You'll have to store that data from each iteration somewhere. There are a few ways you can do it. You can just store everything in a list, then create your DataFrame. Or, what I did is create a "temporary" dataframe after each iteration, then append that into the final dataframe. Think of it like bailing water: you have a small bucket of water that you empty into a large bucket, which collects and holds all the water you are trying to gather.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json

print('all imported successfuly')

# Initialize an empty dataframe
df = pd.DataFrame()

for x in range(1, 44):
    published = []
    updated = []
    reported = []

    link = (f'https://www.trustpilot.com/review/birchbox.com?page={x}')
    print(link)
    req = requests.get(link)
    content = req.content
    soup = BeautifulSoup(content, "lxml")

    names = [x.text.strip() for x in soup.find_all('div', attrs={'class': 'consumer-information__name'})]
    headers = [x.text.strip() for x in soup.find_all('h2', attrs={'class': 'review-content__title'})]
    bodies = [x.text.strip() for x in soup.find_all('p', attrs={'class': 'review-content__text'})]
    ratings = [x.text.strip() for x in soup.find_all('div', attrs={'class': 'star-rating star-rating--medium'})]

    dateElements = soup.find_all('div', attrs={'class': 'review-content-header__dates'})
    for date in dateElements:
        jsonData = json.loads(date.text.strip())
        published.append(jsonData['publishedDate'])
        updated.append(jsonData['updatedDate'])
        reported.append(jsonData['reportedDate'])

    # Create your temporary dataframe of the first iteration, then append that into your "final" dataframe
    temp_df = pd.DataFrame({'User Name': names, 'Header': headers, 'Body': bodies, 'Rating': ratings, 'Published Date': published, 'Updated Date': updated, 'Reported Date': reported})
    df = df.append(temp_df, sort=False).reset_index(drop=True)

    print('pass1')

df.to_csv('birchbox006.csv', index=False, encoding='utf-8')
print('excel done')
I am trying to scrape from the first page to page 14 of this website: https://cross-currents.berkeley.edu/archives?author=&title=&type=All&issue=All&region=All
Here is my code:
import requests as r
from bs4 import BeautifulSoup as soup
import pandas

# make a list of all web pages' urls
webpages = []
for i in range(15):
    root_url = 'https://cross-currents.berkeley.edu/archives?author=&title=&type=All&issue=All&region=All&page=' + str(i)
    webpages.append(root_url)
print(webpages)

# start looping through all pages
for item in webpages:
    headers = {'User-Agent': 'Mozilla/5.0'}
    data = r.get(item, headers=headers)
    page_soup = soup(data.text, 'html.parser')

    # find targeted info and put them into a list to be exported to a csv file via pandas
    title_list = [title.text for title in page_soup.find_all('div', {'class': 'field field-name-node-title'})]
    title = [el.replace('\n', '') for el in title_list]

    # export to csv file via pandas
    dataset = {'Title': title}
    df = pandas.DataFrame(dataset)
    df.index.name = 'ArticleID'
    df.to_csv('example31.csv', encoding="utf-8")
The output CSV file only contains the targeted info from the last page. When I print webpages, it shows that all the pages' URLs have been properly put into the list. What am I doing wrong? Thank you in advance!
You are simply overwriting the same output CSV file for every page. You can call .to_csv() in "append" mode to have the new data added to the end of the existing file:
df.to_csv('example31.csv', mode='a', encoding="utf-8", header=False)
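With header=False the file never gets a header row. A common workaround, not part of the original answer, is to write the header only when the file does not exist yet:

import os.path
# write the header row only on the very first append
df.to_csv('example31.csv', mode='a', encoding='utf-8', header=not os.path.exists('example31.csv'))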
Or, even better, collect the titles into a single list and then dump it into a CSV once:
# start looping through all pages
titles = []
for item in webpages:
    headers = {'User-Agent': 'Mozilla/5.0'}
    data = r.get(item, headers=headers)
    page_soup = soup(data.text, 'html.parser')

    # find targeted info and put them into a list to be exported to a csv file via pandas
    title_list = [title.text for title in page_soup.find_all('div', {'class': 'field field-name-node-title'})]
    titles += [el.replace('\n', '') for el in title_list]

# export to csv file via pandas
dataset = [{'Title': title} for title in titles]
df = pandas.DataFrame(dataset)
df.index.name = 'ArticleID'
df.to_csv('example31.csv', encoding="utf-8")
Another way, in addition to what alexce posted, would be to keep appending the DataFrame built inside the loop to a new DataFrame and then write that to the CSV.
Declare finalDf as a DataFrame outside the loop:
finalDf = pandas.DataFrame()
Later do this:
for item in webpages:
    headers = {'User-Agent': 'Mozilla/5.0'}
    data = r.get(item, headers=headers)
    page_soup = soup(data.text, 'html.parser')

    # find targeted info and put them into lists to be exported to a csv file via pandas
    title_list = [title.text for title in page_soup.find_all('div', {'class': 'field field-name-node-title'})]
    title = [el.replace('\n', '') for el in title_list]

    # export to csv file via pandas
    dataset = {'Title': title}
    df = pandas.DataFrame(dataset)
    finalDf = finalDf.append(df)
    # df.index.name = 'ArticleID'
    # df.to_csv('example31.csv', mode='a', encoding="utf-8", header=False)

finalDf = finalDf.reset_index(drop=True)
finalDf.index.name = 'ArticleID'
finalDf.to_csv('example31.csv', encoding="utf-8")
Notice the lines with finalDf
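A side note on the .append() calls in these answers: DataFrame.append has been removed in newer pandas releases, so on a current install the same pattern is usually written by collecting the per-page frames in a list and concatenating them once. A sketch, reusing the webpages list and selectors from the question:

import pandas
import requests as r
from bs4 import BeautifulSoup as soup

frames = []
for item in webpages:
    data = r.get(item, headers={'User-Agent': 'Mozilla/5.0'})
    page_soup = soup(data.text, 'html.parser')
    title_list = [t.text for t in page_soup.find_all('div', {'class': 'field field-name-node-title'})]
    frames.append(pandas.DataFrame({'Title': [el.replace('\n', '') for el in title_list]}))

# pandas.concat is the replacement for the removed DataFrame.append
finalDf = pandas.concat(frames, ignore_index=True)
finalDf.index.name = 'ArticleID'
finalDf.to_csv('example31.csv', encoding='utf-8')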
The code can't scrape the web data page by page successfully, and the CSV output doesn't match the web data records. I want the code to run over all web pages automatically; right now it only handles the first page's data. How can it run the second and third pages by itself? Secondly, in the CSV output the 'hospital_name', 'name' and 'license_type' columns are all empty; their values all show up at the end of the file.
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd

url = "https://www.abvma.ca/client/roster/clientRosterView.html?clientRosterId=168"
url_page_2 = url + '&page=' + str(2)

def get_data_from_url(url):
    # output the data
    data = requests.get(url)
    page_data = soup(data.text, 'html.parser')
    AB_data = page_data.find_all('div', {"class": "col-md-4 roster_tbl"})
    # create a table
    # for each in AB_data:
    #     print(each.text)
    df = pd.DataFrame(AB_data)
    df.head()
    df.drop([0, 1, 2, 9, 3, 4, 5, 6, 7, 8, 10, 12], axis=1, inplace=True)
    for each in AB_data:
        hospital = each.find('a').text
        name = each.find('strong').text
        license_type = each.find('font').text
        # print(hospital)
        # df['hospital_name'] = hospital
        df = df.append(pd.DataFrame({'hospital_name': hospital,
                                     'name': name, 'license_type': license_type}, index=[0]), sort=False)
    pd.set_option('display.max_columns', None)
    print(df)
    df.to_csv('AB_Vets_2018.csv', index=False)
Python 3:
import csv
import requests
from bs4 import BeautifulSoup

FIELDNAMES = (
    'first_name',
    'last_name',
    'license_type',
    'location',
    'reg_num'
)

def get_page(page_num):
    base_url = "https://www.abvma.ca/client/roster/clientRosterView.html"
    params = {
        'clientRosterId': 168,
        'page': page_num
    }
    r = requests.get(base_url, params=params)
    r.raise_for_status()
    return r.text

def parse_page(page_html):
    result = []
    soup = BeautifulSoup(page_html, 'lxml')
    for vet in soup.find_all('div', class_='col-md-4 roster_tbl'):
        name, *location, title, licence_type, reg_num = vet.stripped_strings
        last_name, first_name = name.split(', ', maxsplit=1)
        result.append({
            'first_name': first_name,
            'last_name': last_name,
            'license_type': licence_type,
            'location': '' if not location else location[0],
            'reg_num': int(reg_num.split()[-1])
        })
    return result

if __name__ == '__main__':
    result = []
    for page_num in range(1, 35):
        page_html = get_page(page_num)
        parsed_page = parse_page(page_html)
        result.extend(parsed_page)

    with open('output.csv', 'w') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(result)
Output CSV