How to scrape data from the table? - python

import requests
from bs4 import BeautifulSoup
import pandas as pd

res = requests.get("https://www.worldometers.info/coronavirus/#countries")
soup = BeautifulSoup(res.text, "html.parser")
table = soup.find("table", {"id": "main_table_countries_today"})
columns = [i.get_text(strip=True) for i in table.find("thead").find_all("th")]
rows = []
for row in table.find("tbody").find_all("tr"):
    rows.append([i.get_text(strip=True) for i in row.find_all("td")])
df = pd.DataFrame(rows, columns=columns)
df.to_csv("data.csv", index=False)
print(df)
Output:
# Country,Other ... 1 Deathevery X ppl 1 Testevery X ppl
0 North America ...
1 South America ...
2 Asia ...
3 Europe ...
4 Africa ...
.. ... ... ... ... ...
218 211 St. Barth ... 8
219 212 British Virgin Islands ... 30,249 24
220 213 Saint Pierre Miquelon ...
221 214 Anguilla ... 40
222 215 China ... 310,601 16
[223 rows x 19 columns]
I changed to the above, but why is only part of the data shown instead of the whole table? And how can I select columns by index? I would like to keep five columns: 'Country', 'Total Cases', 'Total Deaths', 'Total Recover' and 'Population'.

Contents of data.csv:
#,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl
,North America,"5,657,552","+5,378","222,196",+295,"2,919,610","+4,662","2,515,746","26,013",,,,,,North America,,,
,South America,"4,245,834","+1,360","146,906",+89,"2,851,587",+188,"1,247,341","14,300",,,,,,South America,,,
,Asia,"4,453,650","+3,721","99,365",+41,"3,301,717","+5,326","1,052,568","19,086",,,,,,Asia,,,
,Europe,"2,898,953",+456,"203,794",,"1,748,496",+41,"946,663","5,143",,,,,,Europe,,,
,Africa,"961,388",,"20,350",,"615,346",+2,"325,692","1,150",,,,,,Africa,,,
,Oceania,"20,106",+397,246,+13,"12,276",+202,"7,584",43,,,,,,Australia/Oceania,,,
,,721,,15,,651,,55,4,,,,,,,,,
,World,"18,238,204","+11,312","692,872",+438,"11,449,683","+10,421","6,095,649","65,739","2,340",88.9,,,,All,,,
1,USA,"4,813,647",,"158,365",,"2,380,217",,"2,275,065","18,623","14,535",478,"59,935,508","180,977","331,176,957",North America,69,"2,091",6
2,Brazil,"2,733,677",,"94,130",,"1,884,051",,"755,496","8,318","12,853",443,"13,096,132","61,573","212,694,204",South America,78,"2,260",16
3,India,"1,805,838","+1,136","38,176",+15,"1,188,389","+1,161","579,273","8,944","1,307",28,"20,202,858","14,627","1,381,196,835",Asia,765,"36,180",68
4,Russia,"850,870",,"14,128",,"650,173",,"186,569","2,300","5,830",97,"28,793,260","197,295","145,940,242",Europe,172,"10,330",5
5,South Africa,"511,485",,"8,366",,"347,227",,"155,892",539,"8,615",141,"3,036,779","51,147","59,373,395",Africa,116,"7,097",20
...
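Two separate things are happening here. Nothing is missing from the scrape: data.csv holds the complete table, and pandas merely truncates the printed preview (the ... markers) once a frame exceeds its display limits. To keep just the five columns, select them by label, using the header names exactly as they were scraped (get_text(strip=True) removed the spaces, so they appear as 'Country,Other', 'TotalCases', 'TotalDeaths', 'TotalRecovered' and 'Population'), or by position with .iloc. A sketch, with positions taken from the header order in the csv above:
# Widen the console preview; the underlying DataFrame was always complete.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Select by label (names as scraped from the page header):
subset = df[["Country,Other", "TotalCases", "TotalDeaths", "TotalRecovered", "Population"]]

# Or select by position; indices follow the column order in data.csv above:
subset = df.iloc[:, [1, 2, 4, 6, 14]]

subset.to_csv("data_subset.csv", index=False)
print(subset)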

Related

How can I scrape the team names and goals from this site into a table? I've been trying a few different methods but can't quite figure it out.

import requests
from bs4 import BeautifulSoup
URL = "https://www.hockey-reference.com/leagues/NHL_2021_games.html"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find(id="all_games")
table = soup.find('div', attrs = {'id':'div_games'})
print(table.prettify())
Select the table, not the div, to print it:
table = soup.find('table', attrs = {'id':'games'})
print(table.prettify())
Or use pandas.read_html() to get the table and turn it into a dataframe:
import pandas as pd
pd.read_html('https://www.hockey-reference.com/leagues/NHL_2021_games.html', attrs={'id':'games'})[0].iloc[:,:5]
Output:
         Date              Visitor  G                 Home  G.1
0  2021-01-13      St. Louis Blues  4   Colorado Avalanche    1
1  2021-01-13    Vancouver Canucks  5       Edmonton Oilers    3
2  2021-01-13  Pittsburgh Penguins  3  Philadelphia Flyers    6
3  2021-01-13   Chicago Blackhawks  1  Tampa Bay Lightning    5
4  2021-01-13   Montreal Canadiens  4  Toronto Maple Leafs    5
...
If you prefer to pick the cells via their data-stat attributes, extract the text and skip rows that have no data cells:
table = soup.find('div', attrs={'id': 'div_games'})
trs = table.find_all('tr')
gamestats = []
for tr in trs:
    home = tr.find('td', attrs={'data-stat': 'home_team_name'})
    visitor = tr.find('td', attrs={'data-stat': 'visit_team_name'})
    if home and visitor:  # header and spacer rows have no td cells
        gamestats.append({
            'home_team_name': home.get_text(strip=True),
            'visit_team_name': visitor.get_text(strip=True),
        })
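The read_html route above already carries the goals; a small sketch (assuming the column names shown in the output above) that turns the first five columns into one dict per game:
import pandas as pd

df = pd.read_html('https://www.hockey-reference.com/leagues/NHL_2021_games.html',
                  attrs={'id': 'games'})[0].iloc[:, :5]
# drop spacer rows that read_html may parse as all-NaN, then emit records
games = df.dropna(subset=['Visitor']).to_dict('records')
print(games[0])  # {'Date': '2021-01-13', 'Visitor': 'St. Louis Blues', ...}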

Web-scraping. Columns instead of rows

I have difficulties scraping data and saving it to consistent columns.
More specifically, the website I scrape does not have separate tags for each item I scrape (only key and value).
As a result, I get a CSV file with 2 columns - key and value - with the corresponding text in them, whereas my idea is to get one column per key instead.
Is it possible to keep the headers constant and append the value items, or is that not possible given the specifics of the website?
Thank you in advance.
import requests
import bs4
import pandas as pd

keys = []
values = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        soup1 = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = soup1.find('div', {'id': 'car-attributes'})
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            keys.append(key)
            value = car_item.find('span', {'class': 'value'}).text
            values.append(value)

auto_database = pd.DataFrame({
    'key': keys,
    'value': values,
})
auto_database.to_csv('auto_database.csv')
print("Successfully saved..")
Results
Merk & Model: Lako
Bouwjaar: 1996
Uitvoering: 233 C
Carrosserie: Open wagen
Kenteken: OD-31-VD
APK tot: 29 juni 2020
Prijs: € 7.500,00
Merk & Model: RAM
Bouwjaar: 2020
Carrosserie: SUV of Terreinwagen
Brandstof: LPG
Kilometerstand: 70 km
Transmissie: Automaat
Prijs: Zie omschrijving
Motorinhoud: 5.700 cc
Opties:
Wanted result
Merk & Model Bouwjaar
RAM 2020
I suggest to save all metadata per car item to a dataframe, set the keys as the index and join all intermediate dataframes to a final one.
Try this:
import requests
import bs4
import pandas as pd

frames = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        soup1 = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = soup1.find('div', {'id': 'car-attributes'})
        tmp = []
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            value = car_item.find('span', {'class': 'value'}).text
            tmp.append([key, value])
        # one intermediate dataframe per car, with the keys as the index
        frames.append(pd.DataFrame(tmp).set_index(0))

# outer join aligns all keys across cars, then transpose to one row per car
df_final = pd.concat((tmp_df for tmp_df in frames), axis=1, join='outer').reset_index()
df_final = df_final.T
df_final.columns = df_final.loc["index"].values
df_final.drop("index", inplace=True)
df_final.reset_index(inplace=True, drop=True)
df_final.to_csv('auto_database.csv')
display(df_final.head(3))  # display() assumes a Jupyter/IPython session
Output:
Bouwjaar: Brandstof: Kilometerstand: Transmissie: Prijs: Motorinhoud: Kenteken: Opties: Merk & Model: Carrosserie: Uitvoering: APK tot: Energielabel: Verbruik: Topsnelheid: Kosten p/m: Vermogen: APK: Datum van registratie:
0 2014 Diesel 10.000 km Automaat € 10.950,00 400 cc NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2011 Diesel 25.000 km Handgeschakeld Op aanvraag 1.500 cc VR-921-X \n\nParkeersensor\nMetallic lak\nBoordcomputer... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 2016 Benzine 95.545 km Handgeschakeld € 230,00 1.395 cc NaN \n\nParkeersensor\nMetallic lak\nRadio\nMistla... A3 Sedan NaN NaN NaN NaN NaN NaN NaN NaN NaN
Another approach:
import requests
import bs4
import pandas as pd

cars = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        soup1 = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = soup1.find('div', {'id': 'car-attributes'})
        attribs = {}
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).get_text(strip=True, separator='\n')
            value = car_item.find('span', {'class': 'value'}).get_text(strip=True, separator='\n')
            attribs[key] = value
        cars.append(attribs)

unique_keys = set(k for car in cars for k in car.keys())
data = [{k: car.get(k) for k in unique_keys} for car in cars]
auto_database = pd.DataFrame(data)
auto_database.to_csv('auto_database.csv')
print("Successfully saved..")
This produces a csv file with one row per car and one column per attribute key (the original answer showed a LibreOffice screenshot here).
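Incidentally, pandas already unions dict keys when it is handed a list of dicts, so the unique_keys normalisation step is optional:
auto_database = pd.DataFrame(cars)  # missing attributes become NaN automatically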

How to add conditional row to pandas dataframe

I tried looking for a succinct answer and nothing helped. I am trying to add a row to a dataframe that takes a string for the first column and the sum for each remaining column. I ran into a scalar issue, so I tried to make the desired row into a Series and then convert it to a dataframe, but apparently I was adding four rows with one column value each instead of one row with the four column values.
My code:
import os
import pandas as pd

def country_csv():
    # loop through absolute paths of each file in source
    for filename in os.listdir(source):
        filepath = os.path.join(source, filename)
        if not os.path.isfile(filepath):
            continue
        df = pd.read_csv(filepath)
        df = df.groupby(['Country']).sum()
        df.reset_index()  # note: no effect unless assigned back (df = df.reset_index())
        print(df)
        # df.to_csv(os.path.join(path1, filename))
Sample dataframe:
Confirmed Deaths Recovered
Country
Afghanistan 299 7 10
Albania 333 20 99
Would like to see this as the first row
World 632 27 109
import pandas as pd
import datetime as dt
df
Confirmed Deaths Recovered
Country
Afghanistan 299 7 10
Albania 333 20 99
df.loc['World'] = [df['Confirmed'].sum(),df['Deaths'].sum(),df['Recovered'].sum()]
df.sort_values(by=['Confirmed'], ascending=False)
Confirmed Deaths Recovered
Country
World 632 27 109
Albania 333 20 99
Afghanistan 299 7 10
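Since every column here is numeric, the same row can also be written without listing each column; df.sum() returns a Series whose index matches the column names, so .loc aligns it automatically:
df.loc['World'] = df.sum()  # column-wise sums: Confirmed, Deaths, Recovered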
IIUC, you can create a dict then repass it into a dataframe to concat.
data = df.sum(axis=0).to_dict()
data.update({'Country' : 'World'})
df2 = pd.concat([pd.DataFrame(data,index=[0]).set_index('Country'),df],axis=0)
print(df2)
Confirmed Deaths Recovered
Country
World 632 27 109
Afghanistan 299 7 10
Albania 333 20 99
Or a one-liner using assign and transpose:
df2 = pd.concat(
[df.sum(axis=0).to_frame().T.assign(Country="World").set_index("Country"), df],
axis=0,
)
print(df2)
Confirmed Deaths Recovered
Country
World 632 27 109
Afghanistan 299 7 10
Albania 333 20 99

How to convert list to pandas DataFrame?

I use BeautifulSoup to get some data from a webpage:
import pandas as pd
import requests
from bs4 import BeautifulSoup
res = requests.get("http://www.nationmaster.com/country-info/stats/Media/Internet-users")
soup = BeautifulSoup(res.content,'html5lib')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
df.head()
But df is a list, not the pandas DataFrame as I expected from using pd.read_html.
How can I get pandas DataFrame out of it?
You can use read_html with your url:
df = pd.read_html("http://www.nationmaster.com/country-info/stats/Media/Internet-users")[0]
And then if necessary remove GRAPH and HISTORY columns and replace NaNs in column # by forward filling:
df = df.drop(['GRAPH','HISTORY'], axis=1)
df['#'] = df['#'].ffill()
print(df.head())
# COUNTRY AMOUNT DATE
0 1 China 389 million 2009
1 2 United States 245 million 2009
2 3 Japan 99.18 million 2009
3 3 Group of 7 countries (G7) average (profile) 80.32 million 2009
4 4 Brazil 75.98 million 2009
print(df.tail())
# COUNTRY AMOUNT DATE
244 214 Niue 1100 2009
245 =215 Saint Helena, Ascension, and Tristan da Cunha 900 2009
246 =215 Saint Helena 900 2009
247 217 Tokelau 800 2008
248 218 Christmas Island 464 2001
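Note that pd.read_html() always returns a list of DataFrames, one per <table> it parses, which is why df was a list in the question; taking the first element there works as well:
df = pd.read_html(str(table))[0]
df.head()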

The second row and third row should be a single row

from bs4 import BeautifulSoup
import urllib2
from lxml.html import fromstring
import re
import csv
import pandas as pd

wiki = "http://en.wikipedia.org/wiki/List_of_Test_cricket_records"
header = {'User-Agent': 'Mozilla/5.0'}  # Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)

try:
    table = soup.find_all('table')[1]
except AttributeError as e:
    print 'No tables found, exiting'

try:
    rows = table.find_all('tr')
except AttributeError as e:
    print 'No table rows found, exiting'

try:
    first = table.find_all('tr')[0]
except AttributeError as e:
    print 'No table row found, exiting'

try:
    allRows = table.find_all('tr')[1:]
except AttributeError as e:
    print 'No table row found, exiting'

results = []
firstRow = first.find_all('td')
results.append([header.get_text() for header in firstRow])

for row in allRows:
    table_headers = row.find_all('th')
    table_data = row.find_all('td')
    if table_headers:
        results.append([headers.get_text() for headers in table_headers])
    if table_data:
        results.append([data.get_text() for data in table_data])

df = pd.DataFrame(data=results)
df
Desired output:
Margin | Teams | Venue | Season
Innings and 579 runs | England (903-7 d) beat Australia (201 & 123) | The Oval, London | 1938
Innings and 360 runs | Australia (652–7 d) beat South Africa (159 & ..| New Wanderers Stadium, Johannesburg | 2001–02
Innings and 336 runs | West Indies (614–5 d) beat India (124 & 154) | Eden Gardens, Kolkata | 1958–59
Innings and 332 runs | Australia (645) beat England (141 & 172) | Brisbane Cricket Ground | 1946–47
Innings and 324 runs | Pakistan (643) beat New Zealand (73 & 246) | Gaddafi Stadium, Lahore | 2002
You need to collect both th and td tags:
for row in allRows:
    results.append([data.get_text() for data in row.find_all(['th', 'td'])])
And don't forget to omit the last row; it contains only Last updated: ... text:
allRows = table.find_all('tr')[1:-1]
Additionally, if you want the column names in your dataframe to match the table headers on the page, specify the columns keyword argument when creating the dataframe:
headers = [header.get_text() for header in first.find_all('td')]
results = [[data.get_text() for data in row.find_all(['th', 'td'])] for row in allRows]
df = pd.DataFrame(data=results, columns=headers)
print(df)
Produces:
Margin Teams \
0 Innings and 579 runs  England (903-7 d) beat Australia (201 & 123)
1 Innings and 360 runs  Australia (652–7 d) beat South Africa (159 & ...
2 Innings and 336 runs  West Indies (614–5 d) beat India (124 & 154)
3 Innings and 332 runs  Australia (645) beat England (141 & 172)
4 Innings and 324 runs  Pakistan (643) beat New Zealand (73 & 246)
Venue Season
0 The Oval, London 1938
1 New Wanderers Stadium, Johannesburg 2001–02
2 Eden Gardens, Kolkata 1958–59
3 Brisbane Cricket Ground 1946–47
4 Gaddafi Stadium, Lahore 2002
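As an aside, modern pandas can usually pull such a wiki table in one call; a sketch, assuming the word 'Margin' still identifies this table on the page (positional indexing is brittle as the article changes):
import pandas as pd

df = pd.read_html("http://en.wikipedia.org/wiki/List_of_Test_cricket_records",
                  match="Margin")[0]
print(df.head())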
