How to scrape data from the table? - python
import requests
from bs4 import BeautifulSoup
import pandas as pd

res = requests.get("https://www.worldometers.info/coronavirus/#countries")
soup = BeautifulSoup(res.text, "html.parser")
table = soup.find("table", {"id": "main_table_countries_today"})

# Header cells become the column names
columns = [i.get_text(strip=True) for i in table.find("thead").find_all("th")]

# Collect one list of cell texts per body row
rows = []
for row in table.find("tbody").find_all("tr"):
    rows.append([i.get_text(strip=True) for i in row.find_all("td")])

df = pd.DataFrame(rows, columns=columns)
df.to_csv("data.csv", index=False)
print(df)
Output:
# Country,Other ... 1 Deathevery X ppl 1 Testevery X ppl
0 North America ...
1 South America ...
2 Asia ...
3 Europe ...
4 Africa ...
.. ... ... ... ... ...
218 211 St. Barth ... 8
219 212 British Virgin Islands ... 30,249 24
220 213 Saint Pierre Miquelon ...
221 214 Anguilla ... 40
222 215 China ... 310,601 16
[223 rows x 19 columns]
I changed my code to the above, but why is only part of the data shown instead of the whole table? And how can I select columns by index? I would like to keep five columns of data: 'Country', 'Total Cases', 'Total Deaths', 'Total Recovered' and 'Population'.
The data.csv written by the same code does contain the full table:
#,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl
,North America,"5,657,552","+5,378","222,196",+295,"2,919,610","+4,662","2,515,746","26,013",,,,,,North America,,,
,South America,"4,245,834","+1,360","146,906",+89,"2,851,587",+188,"1,247,341","14,300",,,,,,South America,,,
,Asia,"4,453,650","+3,721","99,365",+41,"3,301,717","+5,326","1,052,568","19,086",,,,,,Asia,,,
,Europe,"2,898,953",+456,"203,794",,"1,748,496",+41,"946,663","5,143",,,,,,Europe,,,
,Africa,"961,388",,"20,350",,"615,346",+2,"325,692","1,150",,,,,,Africa,,,
,Oceania,"20,106",+397,246,+13,"12,276",+202,"7,584",43,,,,,,Australia/Oceania,,,
,,721,,15,,651,,55,4,,,,,,,,,
,World,"18,238,204","+11,312","692,872",+438,"11,449,683","+10,421","6,095,649","65,739","2,340",88.9,,,,All,,,
1,USA,"4,813,647",,"158,365",,"2,380,217",,"2,275,065","18,623","14,535",478,"59,935,508","180,977","331,176,957",North America,69,"2,091",6
2,Brazil,"2,733,677",,"94,130",,"1,884,051",,"755,496","8,318","12,853",443,"13,096,132","61,573","212,694,204",South America,78,"2,260",16
3,India,"1,805,838","+1,136","38,176",+15,"1,188,389","+1,161","579,273","8,944","1,307",28,"20,202,858","14,627","1,381,196,835",Asia,765,"36,180",68
4,Russia,"850,870",,"14,128",,"650,173",,"186,569","2,300","5,830",97,"28,793,260","197,295","145,940,242",Europe,172,"10,330",5
5,South Africa,"511,485",,"8,366",,"347,227",,"155,892",539,"8,615",141,"3,036,779","51,147","59,373,395",Africa,116,"7,097",20
...
...
...
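print(df) only truncates the console display; pandas prints a limited number of rows by default, and the DataFrame (and the CSV, as shown above) is complete. Columns can be selected by integer position with iloc or by the parsed header names. A minimal sketch; the positions below assume the column order in the CSV header above:

# Show every row in the console instead of the truncated view
pd.set_option("display.max_rows", None)

# By integer position: 1=Country,Other, 2=TotalCases, 4=TotalDeaths,
# 6=TotalRecovered, 14=Population (per the header order shown above)
subset = df.iloc[:, [1, 2, 4, 6, 14]]

# Equivalent selection by the header names the scraper collected
subset = df[["Country,Other", "TotalCases", "TotalDeaths",
             "TotalRecovered", "Population"]]
print(subset)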
Related
How can I scrape the team names and goals from this site into a table? I've been trying a few different methods but can't quite figure it out.
import requests
from bs4 import BeautifulSoup

URL = "https://www.hockey-reference.com/leagues/NHL_2021_games.html"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find(id="all_games")
table = soup.find('div', attrs = {'id':'div_games'})
print(table.prettify())
Select the table, not the div, to print the table:

table = soup.find('table', attrs={'id': 'games'})
print(table.prettify())

Or use pandas.read_html() to get the table and transform it into a dataframe:

import pandas as pd

pd.read_html('https://www.hockey-reference.com/leagues/NHL_2021_games.html', attrs={'id': 'games'})[0].iloc[:, :5]

Output:

Date        Visitor              G  Home                 G.1
2021-01-13  St. Louis Blues      4  Colorado Avalanche   1
2021-01-13  Vancouver Canucks    5  Edmonton Oilers      3
2021-01-13  Pittsburgh Penguins  3  Philadelphia Flyers  6
2021-01-13  Chicago Blackhawks   1  Tampa Bay Lightning  5
2021-01-13  Montreal Canadiens   4  Toronto Maple Leafs  5
...         ...                  .. ...                  ...
table = soup.find('div', attrs = {'id':'div_games'})
trs = table.find_all('tr')
gamestats = []
for tr in trs:
    gamestat = {}
    gamestat['home_team_name'] = tr.find('td', attrs = {'data-stat' : 'home_team_name'})
    gamestat['visit_team_name'] = tr.find('td', attrs = {'data-stat' : 'visit_team_name'})
    gamestats.append(gamestat)
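A likely issue with that last loop: find() returns Tag objects (or None for rows without those cells), not text. A minimal fix sketch, keeping the data-stat names from the snippet above (assuming they match the page's markup):

table = soup.find('div', attrs={'id': 'div_games'})
gamestats = []
for tr in table.find_all('tr'):
    home = tr.find('td', attrs={'data-stat': 'home_team_name'})
    visit = tr.find('td', attrs={'data-stat': 'visit_team_name'})
    if home is None or visit is None:
        # header rows have no matching td cells
        continue
    gamestats.append({
        'home_team_name': home.get_text(strip=True),
        'visit_team_name': visit.get_text(strip=True),
    })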
Web-scraping. Columns instead of rows
I have difficulties scraping data and saving it to consistent columns. More specifically, the website I scrape does not have separate tags for each item I scrape (except key and value). As a result, I get a CSV file with two rows, key and value, and the corresponding text in them, whereas my idea is to get columns instead. Is it possible to keep the headers constant and append the value items, or is that not possible given the specifics of the website? Thank you in advance.

import requests
import bs4
import pandas as pd

keys = []
values = []

for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        soup1 = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = soup1.find('div', {'id': 'car-attributes'})
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            keys.append(key)
            value = car_item.find('span', {'class': 'value'}).text
            values.append(value)

auto_database = pd.DataFrame({
    'key': keys,
    'value': values,
})
auto_database.to_csv('auto_database.csv')
print("Successfully saved..")

Results:

Merk & Model:   Lako
Bouwjaar:       1996
Uitvoering:     233 C
Carrosserie:    Open wagen
Kenteken:       OD-31-VD
APK tot:        29 juni 2020
Prijs:          € 7.500,00
Merk & Model:   RAM
Bouwjaar:       2020
Carrosserie:    SUV of Terreinwagen
Brandstof:      LPG
Kilometerstand: 70 km
Transmissie:    Automaat
Prijs:          Zie omschrijving
Motorinhoud:    5.700 cc
Opties:

Wanted result:

Merk & Model  Bouwjaar
RAM           2020
I suggest saving all metadata per car item to a dataframe, setting the keys as the index, and joining all intermediate dataframes into a final one. Try this:

import requests
import bs4
import pandas as pd

frames = []

for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        soup1 = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = soup1.find('div', {'id': 'car-attributes'})
        tmp = []
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            value = car_item.find('span', {'class': 'value'}).text
            tmp.append([key, value])
        frames.append(pd.DataFrame(tmp).set_index(0))

df_final = pd.concat((tmp_df for tmp_df in frames), axis=1, join='outer').reset_index()
df_final = df_final.T
df_final.columns = df_final.loc["index"].values
df_final.drop("index", inplace=True)
df_final.reset_index(inplace=True, drop=True)
df_final.to_csv('auto_database.csv')
display(df_final.head(3))

Output:

  Bouwjaar: Brandstof: Kilometerstand: Transmissie: Prijs: Motorinhoud: Kenteken: Opties: Merk & Model: Carrosserie: Uitvoering: APK tot: Energielabel: Verbruik: Topsnelheid: Kosten p/m: Vermogen: APK: Datum van registratie:
0 2014 Diesel 10.000 km Automaat € 10.950,00 400 cc NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2011 Diesel 25.000 km Handgeschakeld Op aanvraag 1.500 cc VR-921-X \n\nParkeersensor\nMetallic lak\nBoordcomputer... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 2016 Benzine 95.545 km Handgeschakeld € 230,00 1.395 cc NaN \n\nParkeersensor\nMetallic lak\nRadio\nMistla... NaN A3 Sedan NaN NaN NaN NaN NaN NaN NaN NaN NaN
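For intuition, a tiny self-contained illustration (with hypothetical values) of the join-then-transpose step above: each car becomes a one-column frame indexed by its keys, and the outer join lines up keys that not every car has:

import pandas as pd

car1 = pd.DataFrame([["2014"], ["Diesel"]], index=["Bouwjaar:", "Brandstof:"])
car2 = pd.DataFrame([["2016"], ["1.395 cc"]], index=["Bouwjaar:", "Motorinhoud:"])

# Outer join on the key-index, then transpose so keys become columns
df = pd.concat([car1, car2], axis=1, join="outer").T.reset_index(drop=True)
print(df)   # columns Bouwjaar:, Brandstof:, Motorinhoud:, NaN where a car lacks a key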
Another approach:

import requests
import bs4
import pandas as pd

cars = []

for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        soup1 = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = soup1.find('div', {'id': 'car-attributes'})
        attribs = {}
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).get_text(strip=True, separator='\n')
            value = car_item.find('span', {'class': 'value'}).get_text(strip=True, separator='\n')
            attribs[key] = value
        cars.append(attribs)

unique_keys = set(k for car in cars for k in car.keys())
data = [{k: car.get(k) for k in unique_keys} for car in cars]
auto_database = pd.DataFrame(data)
auto_database.to_csv('auto_database.csv')
print("Successfully saved..")

Produces a csv file with one column per key (the original answer showed a LibreOffice screenshot of it).
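A note on that last step: pd.DataFrame already aligns a list of dicts on the union of their keys and fills missing entries with NaN, so the unique_keys normalisation above is optional; it mainly makes the resulting column set explicit.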
How to add conditional row to pandas dataframe
I tried looking for a succinct answer and nothing helped. I am trying to add a row to a dataframe that takes a string for the first column and then, for each remaining column, the column sum. I ran into a scalar issue, so I tried to make the desired row into a Series and then convert it to a DataFrame, but apparently I was adding four rows with one column value instead of one row with the four column values. My code:

def country_csv():
    # loop through absolute paths of each file in source
    for filename in os.listdir(source):
        filepath = os.path.join(source, filename)
        if not os.path.isfile(filepath):
            continue
        df = pd.read_csv(filepath)
        df = df.groupby(['Country']).sum()
        df.reset_index()
        print(df)
        # df.to_csv(os.path.join(path1, filename))

Sample dataframe:

             Confirmed  Deaths  Recovered
Country
Afghanistan        299       7         10
Albania            333      20         99

Would like to see this as the first row:

World              632      27        109
import pandas as pd
import datetime as dt

df

             Confirmed  Deaths  Recovered
Country
Afghanistan        299       7         10
Albania            333      20         99

df.loc['World'] = [df['Confirmed'].sum(), df['Deaths'].sum(), df['Recovered'].sum()]
df.sort_values(by=['Confirmed'], ascending=False)

             Confirmed  Deaths  Recovered
Country
World              632      27        109
Albania            333      20         99
Afghanistan        299       7         10
IIUC, you can create a dict, then pass it back into a dataframe to concat:

data = df.sum(axis=0).to_dict()
data.update({'Country': 'World'})
df2 = pd.concat([pd.DataFrame(data, index=[0]).set_index('Country'), df], axis=0)
print(df2)

             Confirmed  Deaths  Recovered
Country
World              632      27        109
Afghanistan        299       7         10
Albania            333      20         99

Or a one-liner using assign and transpose:

df2 = pd.concat(
    [df.sum(axis=0).to_frame().T.assign(Country="World").set_index("Country"), df],
    axis=0,
)
print(df2)

             Confirmed  Deaths  Recovered
Country
World              632      27        109
Afghanistan        299       7         10
Albania            333      20         99
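Since every column in the sample frame is numeric, a still shorter variant (a sketch built on the sample data above) assigns the column-wise sum directly:

import pandas as pd

df = pd.DataFrame(
    {"Confirmed": [299, 333], "Deaths": [7, 20], "Recovered": [10, 99]},
    index=pd.Index(["Afghanistan", "Albania"], name="Country"),
)

# df.sum() returns a Series indexed by the column names,
# so it can be assigned as a new row in one step
df.loc["World"] = df.sum()
print(df.sort_values("Confirmed", ascending=False))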
How to convert list to pandas DataFrame?
I use BeautifulSoup to get some data from a webpage:

import pandas as pd
import requests
from bs4 import BeautifulSoup

res = requests.get("http://www.nationmaster.com/country-info/stats/Media/Internet-users")
soup = BeautifulSoup(res.content, 'html5lib')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
df.head()

But df is a list, not the pandas DataFrame I expected from using pd.read_html. How can I get a pandas DataFrame out of it?
You can use read_html with your url directly:

df = pd.read_html("http://www.nationmaster.com/country-info/stats/Media/Internet-users")[0]

And then, if necessary, remove the GRAPH and HISTORY columns and replace NaNs in column # by forward filling:

df = df.drop(['GRAPH', 'HISTORY'], axis=1)
df['#'] = df['#'].ffill()
print(df.head())

   #                                       COUNTRY         AMOUNT  DATE
0  1                                         China    389 million  2009
1  2                                 United States    245 million  2009
2  3                                         Japan  99.18 million  2009
3  3  Group of 7 countries (G7) average (profile)   80.32 million  2009
4  4                                        Brazil  75.98 million  2009

print(df.tail())

        #                                        COUNTRY AMOUNT  DATE
244   214                                           Niue   1100  2009
245  =215  Saint Helena, Ascension, and Tristan da Cunha    900  2009
246  =215                                   Saint Helena    900  2009
247   217                                        Tokelau    800  2008
248   218                               Christmas Island    464  2001
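Applied to the original snippet, the only change needed is to index into the list that pd.read_html returns, one DataFrame per parsed table:

import pandas as pd
import requests
from bs4 import BeautifulSoup

res = requests.get("http://www.nationmaster.com/country-info/stats/Media/Internet-users")
soup = BeautifulSoup(res.content, 'html5lib')
table = soup.find_all('table')[0]

# read_html returns a list even for a single table -- take the first element
df = pd.read_html(str(table))[0]
print(df.head())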
The second row and third row should be a single row
from bs4 import BeautifulSoup
import urllib2
from lxml.html import fromstring
import re
import csv
import pandas as pd

wiki = "http://en.wikipedia.org/wiki/List_of_Test_cricket_records"
header = {'User-Agent': 'Mozilla/5.0'}  # Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)

try:
    table = soup.find_all('table')[1]
except AttributeError as e:
    print 'No tables found, exiting'

try:
    rows = table.find_all('tr')
except AttributeError as e:
    print 'No table rows found, exiting'

try:
    first = table.find_all('tr')[0]
except AttributeError as e:
    print 'No table row found, exiting'

try:
    allRows = table.find_all('tr')[1:]
except AttributeError as e:
    print 'No table row found, exiting'

results = []
firstRow = first.find_all('td')
results.append([header.get_text() for header in firstRow])

for row in allRows:
    table_headers = row.find_all('th')
    table_data = row.find_all('td')
    if table_headers:
        results.append([headers.get_text() for headers in table_headers])
    if table_data:
        results.append([data.get_text() for data in table_data])

df = pd.DataFrame(data = results)
df

Desired output:

Margin               | Teams                                            | Venue                               | Season
Innings and 579 runs | England (903-7 d) beat Australia (201 & 123)     | The Oval, London                    | 1938
Innings and 360 runs | Australia (652–7 d) beat South Africa (159 & ..  | New Wanderers Stadium, Johannesburg | 2001–02
Innings and 336 runs | West Indies (614–5 d) beat India (124 & 154)     | Eden Gardens, Kolkata               | 1958–59
Innings and 332 runs | Australia (645) beat England (141 & 172)         | Brisbane Cricket Ground             | 1946–47
Innings and 324 runs | Pakistan (643) beat New Zealand (73 & 246)       | Gaddafi Stadium, Lahore             | 2002
You need to collect both th and td tags:

for row in allRows:
    results.append([data.get_text() for data in row.find_all(['th', 'td'])])

(Each tr in that table holds the margin in a th cell and the rest of the record in td cells; your loop appended them as two separate result rows, which is why each record was split in two.)

And don't forget to omit the last row; it has only Last updated: ... text inside:

allRows = table.find_all('tr')[1:-1]

Additionally, if you want the column names in your dataframe to match the table headers on the page, you need to specify the columns keyword argument while creating the dataframe:

headers = [header.get_text() for header in first.find_all('td')]
results = [[data.get_text() for data in row.find_all(['th', 'td'])]
           for row in allRows]

df = pd.DataFrame(data=results, columns=headers)
print(df)

Produces:

                 Margin                                              Teams  \
0  Innings and 579 runs      England (903-7 d) beat Australia (201 & 123)
1  Innings and 360 runs  Australia (652–7 d) beat South Africa (159 & ...
2  Innings and 336 runs      West Indies (614–5 d) beat India (124 & 154)
3  Innings and 332 runs          Australia (645) beat England (141 & 172)
4  Innings and 324 runs        Pakistan (643) beat New Zealand (73 & 246)

                                 Venue   Season
0                     The Oval, London     1938
1  New Wanderers Stadium, Johannesburg  2001–02
2                Eden Gardens, Kolkata  1958–59
3              Brisbane Cricket Ground  1946–47
4              Gaddafi Stadium, Lahore     2002