Create a table using pandas with a dataset from a Wikipedia table - Python

I am trying to tabulate data into three columns (title, release date, and continuity) using pandas. I am trying to fetch my dataset by scraping the data from the Released films section of this Wikipedia page, and I tried following the steps from this YouTube video.
Here is my code:
import requests as r
from bs4 import BeautifulSoup
import pandas as pd
response = r.get("https://en.wikipedia.org/wiki/DC_Universe_Animated_Original_Movies")
wiki_text = response.text
soup = BeautifulSoup(wiki_text, "html.parser")
table_soup = soup.find_all("table")
filtered_table_soup = [table for table in table_soup if table.th is not None]
required_table = None
for table in filtered_table_soup:
    if str(table.th.string).strip() == "Release date":
        required_table = table
        break
print(required_table)
Whenever I run the code, it always prints None instead of the table whose header is "Release date".
I am new to web scraping, by the way, so please go easy on me.
Thank you.

Unless BS4 is a requirement, you can just use pandas to fetch all HTML tables on that page. It will make a DataFrame of each table and store them in a list. You can then loop through the list to find the table of interest.
import pandas as pd
url = r"https://en.wikipedia.org/wiki/DC_Universe_Animated_Original_Movies"
tables = pd.read_html(url) # Returns list of all tables on page
for tab in tables:
    if "Release date" in tab.columns:
        required_table = tab
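If you do want to stick with BeautifulSoup, the likely reason your loop ends with required_table still being None is that table.th only returns the very first header cell of each table (usually "Title"), so the comparison against "Release date" never succeeds. A minimal sketch of one way around that, checking every header cell instead (assuming the page structure is still as in the question):
import requests as r
from bs4 import BeautifulSoup

response = r.get("https://en.wikipedia.org/wiki/DC_Universe_Animated_Original_Movies")
soup = BeautifulSoup(response.text, "html.parser")

required_table = None
for table in soup.find_all("table"):
    # Look at every header cell in the table, not just the first one
    header_texts = [th.get_text(strip=True) for th in table.find_all("th")]
    if "Release date" in header_texts:
        required_table = table
        break

print(required_table is not None)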

It's actually really simple:
The table is the second <table> on the page, so use indexing to get the correct table:
import pandas as pd
URL = "https://en.wikipedia.org/wiki/DC_Universe_Animated_Original_Movies"
df = pd.read_html(URL, header=0)[1]
print(df.to_string())
Prints (truncated)
Title Release date Continuity Adapted from
0 Superman: Doomsday September 21, 2007 Standalone "The Death of Superman"
1 Justice League: The New Frontier February 26, 2008 Standalone DC: The New Frontier
2 Batman: Gotham Knight July 8, 2008 Nolanverse (unofficial)[2] Batman: "The Batman Nobody Knows"
3 Wonder Woman March 3, 2009 Standalone Wonder Woman: "Gods and Mortals"
4 Green Lantern: First Flight July 28, 2009 Standalone NaN
5 Superman/Batman: Public Enemies September 29, 2009 Superman/Batman[3] Superman/Batman: "Public Enemies"
6 Justice League: Crisis on Two Earths February 23, 2010 Crisis on Two Earths / Doom "Crisis on Earth-Three!" / JLA: Earth 2
7 Batman: Under the Red Hood July 7, 2010 Standalone Batman: "Under the Hood"
...
Or, if you want to specifically use BeautifulSoup, you can use a CSS selector to select the second table:
import requests
import pandas as pd
from bs4 import BeautifulSoup
URL = "https://en.wikipedia.org/wiki/DC_Universe_Animated_Original_Movies"
soup = BeautifulSoup(requests.get(URL).text, "html.parser")
# find the second table
table = soup.select_one("table:nth-of-type(2)")
df = pd.read_html(str(table))[0]
print(df.to_string())
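One small caveat: recent pandas versions (2.1 and later, as far as I know) warn about passing a literal HTML string to read_html and expect a file-like object instead, so you may want to wrap the table markup in io.StringIO. A tiny sketch under that assumption:
from io import StringIO

df = pd.read_html(StringIO(str(table)))[0]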

Try:
import requests
from bs4 import BeautifulSoup
url = 'https://en.wikipedia.org/wiki/DC_Universe_Animated_Original_Movies'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
table = soup.select_one('h2:has(#Released_films) + table')
header = [th.text.strip() for th in table.select('th')]
data = []
for row in table.select('tr:has(td)'):
    tds = [td.text.strip() for td in row.select('td')]
    data.append(tds)
print(('{:<45}'*4).format(*header))
print('-' * (45*4))
for row in data:
    print(('{:<45}'*len(row)).format(*row))
Prints:
Title Release date Continuity Adapted from
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Superman: Doomsday September 21, 2007 Standalone "The Death of Superman"
Justice League: The New Frontier February 26, 2008 Standalone DC: The New Frontier
Batman: Gotham Knight July 8, 2008 Nolanverse (unofficial)[2] Batman: "The Batman Nobody Knows"
Wonder Woman March 3, 2009 Standalone Wonder Woman: "Gods and Mortals"
Green Lantern: First Flight July 28, 2009 Standalone
Superman/Batman: Public Enemies September 29, 2009 Superman/Batman[3] Superman/Batman: "Public Enemies"
Justice League: Crisis on Two Earths February 23, 2010 Crisis on Two Earths / Doom "Crisis on Earth-Three!" / JLA: Earth 2
Batman: Under the Red Hood July 7, 2010 Standalone Batman: "Under the Hood"
Superman/Batman: Apocalypse September 28, 2010 Superman/Batman[3] Superman/Batman: "The Supergirl from Krypton"
All-Star Superman February 22, 2011 Standalone All-Star Superman
Green Lantern: Emerald Knights July 7, 2011 Standalone "New Blood" / "What Price Honor?" / "Mogo Doesn't Socialize" / "Tygers"
Batman: Year One October 18, 2011 Year One / Dark Knight Returns[4][5] Batman: Year One
Justice League: Doom February 28, 2012 Crisis on Two Earths / Doom JLA: "Tower of Babel"
Superman vs. The Elite June 12, 2012 Standalone "What's So Funny About Truth, Justice & the American Way?"
...and so on.

Related

Unable to extract table from the ESPN website

I am trying to extract a table from the ESPN website using the code below; however, I am unable to extract the whole table and save it to a new CSV file. I am getting the error AttributeError: 'list' object has no attribute 'to_csv'
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
import requests
service = Service(r"C:\Users\Sai Ram\Documents\Python\Driver\chromedriver.exe")
def get_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("disable-infobars")
    options.add_argument("start-maximized")
    options.add_argument("disable-dev-shm-usage")
    options.add_argument("no-sandbox")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_argument("disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(service=service, options=options)
    driver.get("https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?class=3;id=2022;type=year")
    return driver

def main():
    get_driver()
    url = "https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?class=3;id=2022;type=year"
    html = requests.get(url).content
    df_list = pd.read_html(html)
    df = df_list
    df.to_csv(r'C:\Users\Sai Ram\Documents\Power BI\End to End T-20 World Cup DashBoard\CSV Data\scappying.csv')

main()
Could you please help resolve this issue and, if possible, suggest the best way to extract data using web scraping in Python?
You can simply do this:
import pandas as pd
url = "https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?class=3;id=2022;type=year"
df = pd.read_html(url)[0] # <- this index is important
df.head()
Output
Team 1 Team 2 Winner Margin Ground Match Date Scorecard
0 West Indies England West Indies 9 wickets Bridgetown Jan 22, 2022 T20I # 1453
1 West Indies England England 1 run Bridgetown Jan 23, 2022 T20I # 1454
2 West Indies England West Indies 20 runs Bridgetown Jan 26, 2022 T20I # 1455
3 West Indies England England 34 runs Bridgetown Jan 29, 2022 T20I # 1456
4 West Indies England West Indies 17 runs Bridgetown Jan 30, 2022 T20I # 1457
To save your dataframe as a CSV file:
df.to_csv("your_data_file.csv")
By loading the webpage into pandas you get two DataFrames (in this case, for this particular site). The first DataFrame is the one you want (at least as I understand it from your post), with the large table of results. The second one is a smaller list. So by indexing with [0] you retrieve the first DataFrame.
You get the AttributeError because you try to treat the list of (two) DataFrames as if it were a single DataFrame.
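As a side note, the original script builds a Selenium driver but never actually uses it: main() calls get_driver(), discards the returned driver, and fetches the page again with requests. For this particular page plain pd.read_html(url) is enough, but if the table really did need a browser (e.g. JavaScript-rendered content), a rough sketch would be to hand the rendered page source to pandas (the output filename below is just a placeholder):
driver = get_driver()                      # keep the driver returned by the helper above
df = pd.read_html(driver.page_source)[0]   # parse the table from the rendered HTML
driver.quit()
df.to_csv("match_results.csv", index=False)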

Appending elements of a list into a multi-dimensional list

Hi, I'm doing some web scraping with NBA data in Python on this page. Some elements of basketball-reference are easy to scrape, but this one is giving me some trouble with my lack of Python knowledge.
I'm able to grab the data and column headers I want, but I end up with two lists of data that I need to combine by their index (I think?) so that index 0 of player_injury_info lines up with index 0 of player_names, etc., which I don't know how to do.
Below I've pasted some code that you can follow along with.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timezone, timedelta
url = "https://www.basketball-reference.com/friv/injuries.fcgi"
html = urlopen(url)
soup = BeautifulSoup(html)
# this correctly gives me the 4 column headers I want (Player, Team, Update, Description)
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
# 2 lists - player_injury_info and player_names. They need to be combined.
rows = soup.findAll('tr')
player_injury_info = [[td.getText() for td in rows[i].findAll('td')]
                      for i in range(len(rows))]
player_injury_info = player_injury_info[1:]  # removing first element because we don't need it
player_names = [[th.getText() for th in rows[i].findAll('th')]
                for i in range(len(rows))]
player_names = player_names[1:]  # removing first element because we don't need it
### joining the lists in the correct order - the part I don't know how to do
player_list = player_names.append(player_injury_info)
### this should give me the data frame I want if I can get player_injury_info into the right format
injury_data = pd.DataFrame(player_injury_info, columns = headers)
There might be an easier way to scrape the data into a single list / data frame? Or maybe it's fine to just join the two lists together like I'm trying to do. But if anybody was able to follow along and can offer a solution, I'd appreciate the help!
Let pandas do the parsing of the table for you.
import pandas as pd
url = "https://www.basketball-reference.com/friv/injuries.fcgi"
injury_data = pd.read_html(url)[0]
Output:
print(injury_data)
Player ... Description
0 Onyeka Okongwu ... Out (Shoulder) - The Hawks announced that Okon...
1 Jaylen Brown ... Out (Wrist) - The Celtics announced that Brown...
2 Coby White ... Out (Shoulder) - The Bulls announced that Whit...
3 Taurean Prince ... Out (Ankle) - The Cavaliers announced F Taurea...
4 Jamal Murray ... Out (Knee) - Murray is recovering from a torn ...
5 Klay Thompson ... Out (Right Achilles) - Thompson is on track to...
6 James Wiseman ... Out (Knee) - Wiseman is on track to be ready b...
7 T.J. Warren ... Out (Foot) - Warren underwent foot surgery and...
8 Serge Ibaka ... Out (Back) - The Clippers announced Serge Ibak...
9 Kawhi Leonard ... Out (Knee) - The Clippers announced Kawhi Leon...
10 Victor Oladipo ... Out (Knee) - Oladipo could be cleared for full...
11 Donte DiVincenzo ... Out (Foot) - DiVincenzo suffered a tendon inju...
12 Jarrett Culver ... Out (Ankle) - The Timberwolves announced Culve...
13 Markelle Fultz ... Out (Knee) - Fultz will miss the rest of the s...
14 Jonathan Isaac ... Out (Knee) - Isaac is making progress with his...
15 Dario Šarić ... Out (Knee) - The Suns announced that Sario has...
16 Zach Collins ... Out (Ankle) - The Blazers announced that Colli...
17 Pascal Siakam ... Out (Shoulder) - The Raptors announced Pascal ...
18 Deni Avdija ... Out (Leg) - The Wizards announced that Avdija ...
19 Thomas Bryant ... Out (Left knee) - The Wizards announced that B...
[20 rows x 4 columns]
But if you were to iterate it yourself, I'd simply get at the rows (<tr> tags), then get the player name in the <a> tag, and combine it with that row's <td> tags. Then create your dataframe from the list of those:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timezone, timedelta
url = "https://www.basketball-reference.com/friv/injuries.fcgi"
html = urlopen(url)
soup = BeautifulSoup(html)
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
trs = soup.findAll('tr')[1:]
rows = []
for tr in trs:
    player_name = tr.find('a').text
    data = [player_name] + [x.text for x in tr.find_all('td')]
    rows.append(data)
injury_data = pd.DataFrame(rows, columns = headers)
I think you want this (a list of tuples), using zip:
players = ["joe", "bill"]
injuries = ["tooth-ache", "mental break"]
list(zip(players, injuries))
Result:
[('joe', 'tooth-ache'), ('bill', 'mental break')]
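Applied to the scraped lists from the question, the same idea looks roughly like this (a sketch, assuming both lists line up row for row, with player_names holding one-element lists and player_injury_info holding the remaining cells of each row):
# Concatenate each [name] list with its matching [team, update, description] list
combined = [name + info for name, info in zip(player_names, player_injury_info)]
injury_data = pd.DataFrame(combined, columns=headers)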

PANDAS Web Scraping Multiple Pages

I was working on scraping data using Beautiful Soup from multiple pages of the following website and was able to do it. Can I scrape data from multiple pages using pandas? Following is the code to scrape a single page; the URL links to the other pages as http://www.example.org/whats-on/calendar?page=3 .
import pandas as pd
url = 'http://www.example.org/whats-on/calendar?page=3'
dframe = pd.read_html(url,header=0)
dframe[0]
dframe[0].to_csv('out.csv')
Simply loop over the range of page numbers and append each result to a list of DataFrames. Afterwards, concatenate them into one large DataFrame. One issue with your current code is that header=0 treats the first row as the column header; however, these pages do not have column headers. Hence, use header=None and then rename the columns.
Below scrapes pages 0 - 3. Extend the loop limit for the other pages.
import pandas as pd
dfs = []
# PAGES 0 - 3 SCRAPE
url = 'http://www.lapl.org/whats-on/calendar?page={}'
for i in range(4):
    dframe = pd.read_html(url.format(i), header=None)[0]\
             .rename(columns={0: 'Date', 1: 'Topic', 2: 'Location',
                              3: 'People', 4: 'Category'})
    dfs.append(dframe)
finaldf = pd.concat(dfs)
finaldf.to_csv('Output.csv')
Output
print(finaldf.head())
# Date Topic Location People Category
# 0 Thu, Nov 09, 201710:00am to 12:30pm California Healthier Living : A Chronic Diseas... West Los Angeles Regional Library Seniors Health
# 1 Thu, Nov 09, 201710:00am to 11:30am Introduction to Microsoft WordLearn the basics... North Hollywood Amelia Earhart Regional Library Adults, Job Seekers, Seniors Computer Class
# 2 Thu, Nov 09, 201711:00am Board of Library Commissioners Central Library Adults Meeting
# 3 Thu, Nov 09, 201712:00pm to 1:00pm Tech TryOutCentral Library LobbyDid you know t... Central Library Adults, Teens Computer Class
# 4 Thu, Nov 09, 201712:00pm to 1:30pm Taller de Tejido/ Crochet WorkshopLearn how to... Benjamin Franklin Branch Library Adults, Seniors, Spanish Speakers Arts and Crafts, En Español
The code below will loop through the pages given in the range and append to the DataFrame with the selected fields.
def get_from_website():
    Sample = pd.DataFrame()
    for num in range(1, 6):
        website = 'https://weburl/?page=' + str(num)
        datalist = pd.read_html(website)
        Sample = Sample.append(datalist[0])
    Sample.columns = ['Field1', 'Field2', 'Field3', 'Field4', 'Field5', 'Field6', 'Time', 'Field7', 'Field8']
    return Sample
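One caveat with that snippet: DataFrame.append was deprecated and removed in pandas 2.0, so on current pandas the usual pattern is to collect the frames in a list and concatenate once at the end. A sketch with the same placeholder URL and field names:
import pandas as pd

def get_from_website():
    frames = []
    for num in range(1, 6):
        website = 'https://weburl/?page=' + str(num)
        frames.append(pd.read_html(website)[0])   # first table on each page
    Sample = pd.concat(frames, ignore_index=True)
    Sample.columns = ['Field1', 'Field2', 'Field3', 'Field4', 'Field5', 'Field6', 'Time', 'Field7', 'Field8']
    return Sample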

Extracting specific lines from multiple lists: index error

import urllib.request
import bs4 as bs
import requests
import re
url = " https://www.weather-forecast.com/locations/Kansas-City-
1/forecasts/latest"
request = requests.get(url)
response = request.text
soup = bs.BeautifulSoup(response,'lxml')
for para in soup.find_all('p'):
    a = para.text.split('/n')
    print(a[1])
I am trying to print, for example, only the line from the list up to 94.5W, but it's giving me an index out of range error.
This is the output I get if I only do:
print(a)
['']
['Kansas City is 232\u2009m above sea level and located at 39.05° N 94.50° W. Kansas City has a population of 475378. Local time in Kansas City is CST.']
['Kansas City 1 – 3 Day Weather Forecast Summary: Mostly dry. Freeze-thaw conditions (max 10°C on Wed afternoon, min -2°C on Thu night). Wind will be generally light.']
[' Local time in Kansas City: CST']
['View 3 Hour Detailed Kansas City Weather Forecast for Today']
['Kansas City 4 – 7 Day Weather Forecast Summary: Mostly dry. Very mild (max 12°C on Mon afternoon, min 1°C on Sun night). Winds decreasing (fresh winds from the SSW on Sat morning, light winds from the ENE by Sun night).']
['Kansas City 7 – 10 Day Weather Forecast Summary: Light rain (total 2mm), mostly falling on Wed morning. Very mild (max 17°C on Wed afternoon, min 7°C on Thu night). Wind will be generally light.']
['© 2017 Meteo365.com']
But I only want to print one specific line.
You are declaring the a variable inside the loop and printing it there, hence the list just has one element and it gets overwritten every time. Moving the a declaration and the print() outside the for loop should do the trick:
import bs4 as bs
import requests
import re
url = " https://www.weather-forecast.com/locations/Kansas-City-1/forecasts/latest"
request = requests.get(url)
response = request.text
soup = bs.BeautifulSoup(response,'lxml')
a=[]
for para in soup.find_all('p'):
    a.append(para.text.split('/n'))
print(a[1])
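Incidentally, '/n' in para.text.split('/n') looks like a typo for the newline escape '\n'. Splitting on the literal two characters '/n' almost never splits anything, so each a ends up with a single element and a[1] raises IndexError, which is most likely the error from the question. A quick illustration:
text = "first line\nsecond line"
print(text.split('/n'))   # ['first line\nsecond line'] -> only one element
print(text.split('\n'))   # ['first line', 'second line']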

Get content of table in BeautifulSoup

I have the following table on a website which I am extracting with BeautifulSoup.
This is the URL (I have also attached a picture).
Ideally I would like to have each company in one row in the CSV; however, I am getting it spread over different rows. Please see the picture attached.
I would like to have it like in field "D", but I am getting it in A1, A2, A3...
This is the code I am using to extract:
import csv
import requests
from bs4 import BeautifulSoup

def _writeInCSV(text):
    print "Writing in CSV File"
    with open('sara.csv', 'wb') as csvfile:
        #spamwriter = csv.writer(csvfile, delimiter='\t', quotechar='\n', quoting=csv.QUOTE_MINIMAL)
        spamwriter = csv.writer(csvfile, delimiter='\t', quotechar="\n")
        for item in text:
            spamwriter.writerow([item])

read_list = []
initial_list = []
url = "http://www.nse.com.ng/Issuers-section/corporate-disclosures/corporate-actions/closure-of-register"
r = requests.get(url)
soup = BeautifulSoup(r._content, "html.parser")
#gdata_even = soup.find_all("td", {"class":"ms-rteTableEvenRow-3"})
gdata_even = soup.find_all("td", {"class":"ms-rteTable-default"})
for item in gdata_even:
    print item.text.encode("utf-8")
    initial_list.append(item.text.encode("utf-8"))
    print ""
_writeInCSV(initial_list)
Can someone help, please?
Here is the idea:
read the header cells from the table
read all the other rows from the table
zip all the data row cells with headers producing a list of dictionaries
use csv.DictWriter() to dump to csv
Implementation:
import csv
from pprint import pprint
from bs4 import BeautifulSoup
import requests
url = "http://www.nse.com.ng/Issuers-section/corporate-disclosures/corporate-actions/closure-of-register"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
rows = soup.select("table.ms-rteTable-default tr")
headers = [header.get_text(strip=True).encode("utf-8") for header in rows[0].find_all("td")]
data = [dict(zip(headers, [cell.get_text(strip=True).encode("utf-8") for cell in row.find_all("td")]))
        for row in rows[1:]]
# see what the data looks like at this point
pprint(data)
with open('sara.csv', 'wb') as csvfile:
    spamwriter = csv.DictWriter(csvfile, headers, delimiter='\t', quotechar="\n")
    for row in data:
        spamwriter.writerow(row)
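If you are on Python 3 rather than Python 2, roughly the same approach works with a text-mode file and without the manual .encode("utf-8") calls on the header and cell texts (a sketch; headers and data would then hold plain strings):
with open("sara.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, headers, delimiter="\t")
    writer.writeheader()
    for row in data:
        writer.writerow(row)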
Since @alecxe has already provided an amazing answer, here's another take using the pandas library.
import pandas as pd
url = "http://www.nse.com.ng/Issuers-section/corporate-disclosures/corporate-actions/closure-of-register"
tables = pd.read_html(url)
tb1 = tables[0] # Get the first table.
tb1.columns = tb1.iloc[0] # Assign the first row as header.
tb1 = tb1.iloc[1:] # Drop the first row.
tb1.reset_index(drop=True, inplace=True) # Reset the index.
print tb1.head() # Print first 5 rows.
# tb1.to_csv("table1.csv") # Export to CSV file.
Result:
In [5]: runfile('C:/Users/.../.spyder2/temp.py', wdir='C:/Users/.../.spyder2')
0 Company Dividend Bonus Closure of Register \
0 Nigerian Breweries Plc N3.50 Nil 5th - 11th March 2015
1 Forte Oil Plc N2.50 1 for 5 1st – 7th April 2015
2 Nestle Nigeria N17.50 Nil 27th April 2015
3 Greif Nigeria Plc 60 kobo Nil 25th - 27th March 2015
4 Guaranty Bank Plc N1.50 (final) Nil 17th March 2015
0 AGM Date Payment Date
0 13th May 2015 14th May 2015
1 15th April 2015 22nd April 2015
2 11th May 2015 12th May 2015
3 28th April 2015 5th May 2015
4 ​31st March 2015 31st March 2015
In [6]:
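A slightly shorter variant (just a sketch) is to let read_html promote the first row to the column header directly, instead of reassigning it afterwards:
import pandas as pd

url = "http://www.nse.com.ng/Issuers-section/corporate-disclosures/corporate-actions/closure-of-register"
tb1 = pd.read_html(url, header=0)[0]   # first row of the table becomes the header
# tb1.to_csv("table1.csv")             # export to CSV file, as before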
