Scraping a table: IndexError: list index out of range - python

I am new to Python. I am using it in a Jupyter notebook to scrape a table from Wikipedia. All the code I wrote works, except when I want to put the information into a CSV file. The error that appears is "IndexError: list index out of range".
Here is the code:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
s = requests.Session()
response = s.get(url, timeout=10)
response
table_id = 'main'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.prettify().encode('UTF-8'))
table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr'):
    print(row)
table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    print(col[0].find('a').contents[0])
    print(col[1].string) #name
    print(col[2].string)
    print(col[3].string)
    print(col[4].string)
    print(col[5].find(text=True))
csvfile = open('population.csv', 'w')
csvwriter = csv.writer(csvfile, delimiter=',')
headers = ('COUNTRY','CONTINENT','SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
csvwriter.writerow(headers)
table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    country = col[0].find('a').contents[0]
    continent = col[1].string
    subregion = col[2].string
    population_2018 = col[3].string
    population_2019 = col[4].string
    change = col[5].find(text=True)
    parsed_row = (country, continent, subregion, population_2018, population_2019, change)
    csvwriter.writerow(parsed_row)
csvfile.close()
Thank you very much!

I have a two-part answer: the easiest way to accomplish your task, and where the error in your code comes from.
Let pandas handle the requests, BeautifulSoup and csv for you.
import pandas as pd
URI = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
df = pd.read_html(URI)[3]
df.to_csv('population.csv', index=False)
pandas has .read_html, which returns a list of all the tables on the webpage. Your table is at index 3. With that, I saved it with .to_csv.
With .read_html, you can also pass the attributes of a specific table, e.g. attrs={'id': 'main'}:
# the table is now at index 0
df = pd.read_html(URI, attrs={'id':'main'})[0]
You can also specify the parser that .read_html will hand to BeautifulSoup:
df = pd.read_html(URI, attrs={'id':'main'}, flavor='lxml')[0]
# 'lxml' is known for speed. But you can use `html.parser` if `lxml` or `html5lib` are not installed.
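Another option, if you would rather not rely on the table's id, is .read_html's match argument, which keeps only tables whose text matches a given string or regex (the string below is an assumption about what appears in the target table):
# keep only tables whose text matches this string (assumed to appear in the target table);
# [0] takes the first match, which may need adjusting
df = pd.read_html(URI, match='region')[0]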
See the documentation of .read_html for more.
Update: Debugging Your Code
The error in your code comes from rows where col is empty (rows that contain no td cells, e.g. header-only rows), so col[0] raises an IndexError. Adding an if condition solves the problem:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
s = requests.Session()
response = s.get(url, timeout=10)
response
table_id = 'main'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
#print(soup.prettify().encode('UTF-8'))
csvfile = open('population.csv', 'w')
csvwriter = csv.writer(csvfile, delimiter=',')
headers = ('COUNTRY','CONTINENT','SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
csvwriter.writerow(headers)
table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    # this is all that was missing
    if col:
        country = col[0].find('a')['title']
        continent = col[1].string
        subregion = col[2].string
        population_2018 = col[3].string
        population_2019 = col[4].string
        change = col[5].find(text=True)
        parsed_row = (country, continent, subregion, population_2018, population_2019, change)
        csvwriter.writerow(parsed_row)
csvfile.close()
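As a side note (not needed to fix the error), opening the file in a with block with newline='' closes the file automatically and avoids blank lines between rows on Windows. A minimal sketch of the same writing step:
import csv

headers = ('COUNTRY', 'CONTINENT', 'SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
with open('population.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(headers)
    # ... loop over the table rows and call csvwriter.writerow(parsed_row) exactly as above ...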

Prayson W. Daniel has already given the answer, and I offer another way.
import requests
from simplified_scrapy import SimplifiedDoc, utils, req
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
s = requests.Session()
res = s.get(url, timeout=10)
rows = []
headers = ('COUNTRY','CONTINENT','SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
rows.append(headers)
table_id = 'main'
doc = SimplifiedDoc(res.text)
table = doc.select('table#'+table_id) # Get the table by id.
trs = table.tbody.children.children[1:] # Get all data rows
for tr in trs:
    row = [tr[0].a.text] # First col, get first link
    row.extend(tr.text[1:]) # Remaining cols
    rows.append(row)
utils.save2csv('test_wiki.csv', rows) # Save data to csv

Related

How can I save scraped data from a soup object into a CSV?

I am looking to save only the scraped data into a CSV file.
This is the scraped data and code:
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-
SkillsNetwork/labs/datasets/Programming_Languages.html"
from bs4 import BeautifulSoup
import requests
data = requests.get(url).text
soup = BeautifulSoup(data,"html5lib")
table = soup.find('table')
for row in table.find_all('tr'):
    cols = row.find_all('td')
    programing_language = cols[1].getText()
    salary = cols[3].getText()
    print("{}--->{}".format(programing_language,salary))
Here is the solution.
import pandas as pd
from bs4 import BeautifulSoup
import requests

data = []
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html"
html = requests.get(url).text  # keep the response text in its own variable so it doesn't overwrite the data list
soup = BeautifulSoup(html, "html5lib")
table = soup.find('table')
for row in table.find_all('tr'):
    cols = row.find_all('td')
    programing_language = cols[1].getText()
    salary = cols[3].getText()
    data.append([programing_language, salary])
    #print("{}--->{}".format(programing_language, salary))
cols = ['programing_language', 'salary']
df = pd.DataFrame(data, columns=cols)
df.to_csv("data.csv", index=False)
For a lightweight solution you can just use csv. Ignore the headers row by using tr:nth-child(n+2); this nth-child range selector selects from the second tr onwards. Then, within a loop over the subsequent rows, select the second and fourth columns as follows:
from bs4 import BeautifulSoup as bs
import requests, csv
response = requests.get('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html',
                        headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(response.content, 'lxml')
with open("programming.csv", "w", encoding="utf-8-sig", newline='') as f:
    w = csv.writer(f, delimiter=",", quoting=csv.QUOTE_MINIMAL)
    w.writerow(["Language", "Average Annual Salary"])
    for item in soup.select('tr:nth-child(n+2)'):
        w.writerow([item.select_one('td:nth-child(2)').text,
                    item.select_one('td:nth-child(4)').text])

Appending results with pandas and BeautifulSoup

PROBLEM: I have a list of sites that I want BS and Pandas to grab a data table for. I want to add all the iterative results to the same xlsx or csv file.
My current code below will iterate over each of the 3 sites, but the final product is just the last page that was scraped. If I remove my export function and just print df, I can see all 3 pages of data, so I'm not sure how to correctly append each iteration to my output file.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import gmtime, strftime
#Pass in the URL
url = ["https://www.nfl.com/standings/league/2021/reg", "https://www.nfl.com/standings/league/2020/reg", "https://www.nfl.com/standings/league/2019/reg"]
for site in url:
    #Load the page html
    page = requests.get(site)
    soup = BeautifulSoup(page.text, 'lxml')
    # Get all the table data
    table = soup.find('table', {'summary':'Standings - Detailed View'})
    headers = []
    for i in table.find_all('th'):
        title = i.text.strip()
        headers.append(title)
    #Dataframe the headers into columns
    df = pd.DataFrame(columns = headers)
    # TR for the rows, TD for the values
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        length = len(df)
        df.loc[length] = row_data
    #Write the collected data out to an Excel file
    dateTime = strftime("%d%b%Y_%H%M", gmtime())
    writer = pd.ExcelWriter(dateTime + "Z" + ".xlsx")
    df.to_excel(writer)
    writer.save()
    print('[*] Data successfully written to Excel File.')
Try the following. You need to capture the dataframe from each URL, collect them in a list, concatenate them, and then write the combined dataframe to Excel. This should work, but it is untested. See the comments inline.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import gmtime, strftime
#Pass in the URL
url = ["https://www.nfl.com/standings/league/2021/reg", "https://www.nfl.com/standings/league/2020/reg", "https://www.nfl.com/standings/league/2019/reg"]
df_hold_list = [] #collect each dataframe separately
for site in url:
    #Load the page html
    page = requests.get(site)
    soup = BeautifulSoup(page.text, 'lxml')
    # Get all the table data
    table = soup.find('table', {'summary':'Standings - Detailed View'})
    headers = []
    for i in table.find_all('th'):
        title = i.text.strip()
        headers.append(title)
    #Dataframe the headers into columns
    df = pd.DataFrame(columns = headers)
    # TR for the rows, TD for the values
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        length = len(df)
        df.loc[length] = row_data
    df_hold_list.append(df) # add the dfs to the list
final_df = pd.concat(df_hold_list, axis=0, ignore_index=True) # stack the season tables row-wise, since they share the same columns
# move this out of loop
#Write the collected data out to an Excel file
dateTime = strftime("%d%b%Y_%H%M", gmtime())
writer = pd.ExcelWriter(dateTime + "Z" + ".xlsx")
final_df.to_excel(writer) # write final_df to excel
writer.save()
print('[*] Data successfully written to Excel File.')
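For reference, a tiny illustration of the axis choice with made-up frames: axis=0 stacks the frames vertically, which is what you want when every season's table has the same columns, while axis=1 would place them side by side.
import pandas as pd

a = pd.DataFrame({'Team': ['A'], 'W': [10]})
b = pd.DataFrame({'Team': ['B'], 'W': [12]})

print(pd.concat([a, b], axis=0, ignore_index=True))  # 2 rows, same 2 columns
print(pd.concat([a, b], axis=1))                     # 1 row, 4 columns side by side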

Need help scraping information from multiple webpages and importing it into a csv file in tabular form - Python

I have been working on web scraping the infobox information on Wikipedia. This is the code that I have been using:
import requests
import csv
from bs4 import BeautifulSoup
URL = ['https://en.wikipedia.org/wiki/Workers_Credit_Union','https://en.wikipedia.org/wiki/San_Diego_County_Credit_Union',
'https://en.wikipedia.org/wiki/USA_Federal_Credit_Union','https://en.wikipedia.org/wiki/Commonwealth_Credit_Union',
'https://en.wikipedia.org/wiki/Center_for_Community_Self-Help','https://en.wikipedia.org/wiki/ESL_Federal_Credit_Union',
'https://en.wikipedia.org/wiki/State_Employees_Credit_Union','https://en.wikipedia.org/wiki/United_Heritage_Credit_Union']
for url in URL:
    headers=[]
    rows=[]
    response = requests.get(url)
    soup = BeautifulSoup(response.text,'html.parser')
    table = soup.find('table',class_ ='infobox')
    credit_union_name= soup.find('h1', id = "firstHeading")
    header_tags = table.find_all('th')
    headers = [header.text.strip() for header in header_tags]
    data_rows = table.find_all('tr')
    for row in data_rows:
        value = row.find_all('td')
        beautified_value = [dp.text.strip() for dp in value]
        if len(beautified_value) == 0:
            continue
        rows.append(beautified_value)
    rows.append("")
    rows.append([credit_union_name.text.strip()])
    rows.append([url])
    with open(r'credit_unions.csv','a+',newline="") as output:
        writer=csv.writer(output)
        writer.writerow(headers)
        writer.writerow(rows)
However, when I check the csv file, the information is not presented in tabular form. The scraped elements are stored in nested lists instead of a single list. I need the scraped information for each URL to be stored in a single list and written to the csv file in tabular form with the headings. I need help with this.
The infoboxes have different structures and labels. So I think the best way to solve this is to use dicts and a DictWriter.
import requests
import csv
from bs4 import BeautifulSoup
URL = ['https://en.wikipedia.org/wiki/Workers_Credit_Union',
'https://en.wikipedia.org/wiki/San_Diego_County_Credit_Union',
'https://en.wikipedia.org/wiki/USA_Federal_Credit_Union',
'https://en.wikipedia.org/wiki/Commonwealth_Credit_Union',
'https://en.wikipedia.org/wiki/Center_for_Community_Self-Help',
'https://en.wikipedia.org/wiki/ESL_Federal_Credit_Union',
'https://en.wikipedia.org/wiki/State_Employees_Credit_Union',
'https://en.wikipedia.org/wiki/United_Heritage_Credit_Union']
csv_headers = set()
csv_rows = []
for url in URL:
    csv_row = {}
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    credit_union_name = soup.find('h1', id="firstHeading")
    table = soup.find('table', class_='infobox')
    data_rows = table.find_all('tr')
    for data_row in data_rows:
        label = data_row.find('th')
        value = data_row.find('td')
        if label is None or value is None:
            continue
        beautified_label = label.text.strip()
        beautified_value = value.text.strip()
        csv_row[beautified_label] = beautified_value
        csv_headers.add(beautified_label)
    csv_row["name"] = credit_union_name.text.strip()
    csv_row["url"] = url
    csv_rows.append(csv_row)
with open(r'credit_unions.csv', 'a+', newline="") as output:
    headers = ["name", "url"]
    headers += sorted(csv_headers)
    writer = csv.DictWriter(output, fieldnames=headers)
    writer.writeheader()
    writer.writerows(csv_rows)
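A minimal illustration of why DictWriter fits here (the field names below are made up): rows whose dicts have different keys still line up under one shared header, and any missing field is simply written as an empty cell:
import csv
import io

rows = [{'name': 'A', 'Members': '100,000'},      # hypothetical infobox fields
        {'name': 'B', 'Assets': 'US$1 billion'}]

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=['name', 'Members', 'Assets'])
writer.writeheader()
writer.writerows(rows)   # absent keys become empty cells
print(buf.getvalue())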

Python Web Scraping: Output to csv

I'm making some progress with web scraping; however, I still need some help to perform the following operations:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
# soup = BeautifulSoup(requests.get(converturl).content, 'html.parser')
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
out = []
for tr in soup.select('.col-md-4 tbody tr'):
Within the class col-md-4 I know there are 3 tables. I want to generate a csv whose output has three values per row: first name, last name, and, as the last value, the header name of the table.
first name, last name, header table
Any help would be appreciated.
This is what I have done on my own:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
filename = url.rsplit('/', 1)[1] + '.csv'
tables = soup.select('.col-md-4 table')
rows = []
for tr in tables:
    t = tr.get_text(strip=True, separator='|').split('|')
    rows.append(t)
df = pd.DataFrame(rows)
print(df)
df.to_csv(filename)
Thanks,
This might work:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
tables = soup.select('.col-md-4 table')
rows = []
for table in tables:
    cleaned = list(table.stripped_strings)
    header, names = cleaned[0], cleaned[1:]
    data = [name.split(', ') + [header] for name in names]
    rows.extend(data)
result = pd.DataFrame.from_records(rows, columns=['surname', 'name', 'table'])
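To get the CSV the question asks for, the resulting frame can then be written out, reusing the filename scheme from your own attempt:
filename = url.rsplit('/', 1)[1] + '.csv'
result.to_csv(filename, index=False)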
You need to first iterate through each table you want to scrape, then for each table, get its header and rows of data. For each row of data, you want to parse out the First Name and Last Name (along with the header of the table).
Here's a verbose working example:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
out = []
# Iterate through each of the three tables
for table in soup.select(".col-md-4 table"):
    # Grab the header and rows from the table
    header = table.select("thead th")[0].text.strip()
    rows = [s.text.strip() for s in table.select("tbody tr")]
    t = [] # This list will contain the rows of data for this table
    # Iterate through rows in this table
    for row in rows:
        # Split by comma (last_name, first_name)
        split = row.split(",")
        last_name = split[0].strip()
        first_name = split[1].strip()
        # Create the row of data
        t.append([first_name, last_name, header])
    # Convert list of rows to a DataFrame
    df = pd.DataFrame(t, columns=["first_name", "last_name", "table_name"])
    # Append to list of DataFrames
    out.append(df)

# Write to CSVs...
out[0].to_csv("first_table.csv", index=None) # etc...
Whenever you're web scraping, I highly recommend using strip() on all of the text you parse to make sure you don't have superfluous spaces in your data.
I hope this helps!

How to get the first column values in the wikipedia table using python bs4?

I'm trying to web scrape a data table on Wikipedia using python bs4, but I'm stuck with this problem. When getting the data values, my code is not getting the first column (index zero). I feel there is something wrong with the index but I can't figure it out. Please help. See the code below:
import requests
import pandas as pd
from bs4 import BeautifulSoup

response_obj = requests.get('https://en.wikipedia.org/wiki/Metro_Manila').text
soup = BeautifulSoup(response_obj,'lxml')
Neighborhoods_MM_Table = soup.find('table', {'class':'wikitable sortable'})
rows = Neighborhoods_MM_Table.select("tbody > tr")[3:8]
cities = []
for row in rows:
    city = {}
    tds = row.select('td')
    city["City or Municipal"] = tds[0].text.strip()
    city["%_Population"] = tds[1].text.strip()
    city["Population"] = float(tds[2].text.strip().replace(",",""))
    city["area_sqkm"] = float(tds[3].text.strip().replace(",",""))
    city["area_sqm"] = float(tds[4].text.strip().replace(",",""))
    city["density_sqm"] = float(tds[5].text.strip().replace(",",""))
    city["density_sqkm"] = float(tds[6].text.strip().replace(",",""))
    cities.append(city)
print(cities)
df=pd.DataFrame(cities)
df.head()
import requests
from bs4 import BeautifulSoup
import pandas as pd
def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    target = [item.get_text(strip=True) for item in soup.findAll(
        "td", style="text-align:right") if "%" in item.text] + [""]
    df = pd.read_html(r.content, header=0)[5]
    df = df.iloc[1: -1]
    df['Population (2015)[3]'] = target
    print(df)
    df.to_csv("data.csv", index=False)

main("https://en.wikipedia.org/wiki/Metro_Manila")
