How can I save scraped data from a soup object into a CSV? - python

I am looking to save only the scraped data into a CSV file.
Here is the code that scrapes and prints the data:
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-
SkillsNetwork/labs/datasets/Programming_Languages.html"
from bs4 import BeautifulSoup
import requests
data = requests.get(url).text
soup = BeautifulSoup(data,"html5lib")
table = soup.find('table')
for row in table.find_all('tr'):
cols = row.find_all('td')
programing_language = cols[1].getText()
salary = cols[3].getText()
print("{}--->{}".format(programing_language,salary))

Here is the solution (using a separate variable for the downloaded HTML so the data list isn't overwritten):
import pandas as pd
from bs4 import BeautifulSoup
import requests

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html"
html = requests.get(url).text
soup = BeautifulSoup(html, "html5lib")
table = soup.find('table')

data = []  # rows collected as [language, salary]
for row in table.find_all('tr'):
    cols = row.find_all('td')
    programing_language = cols[1].getText()
    salary = cols[3].getText()
    data.append([programing_language, salary])
    # print("{}--->{}".format(programing_language, salary))

cols = ['programing_language', 'salary']
df = pd.DataFrame(data, columns=cols)
df.to_csv("data.csv", index=False)

For a lightweight solution you can just use csv. Skip the header row by using tr:nth-child(n+2); this nth-child range selector selects from the second tr onwards. Then, within a loop over those rows, select the second and fourth columns as follows:
from bs4 import BeautifulSoup as bs
import requests, csv

response = requests.get('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html',
                        headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(response.content, 'lxml')

with open("programming.csv", "w", encoding="utf-8-sig", newline='') as f:
    w = csv.writer(f, delimiter=",", quoting=csv.QUOTE_MINIMAL)
    w.writerow(["Language", "Average Annual Salary"])
    for item in soup.select('tr:nth-child(n+2)'):
        w.writerow([item.select_one('td:nth-child(2)').text,
                    item.select_one('td:nth-child(4)').text])

Related

Using Beautiful Soup on multiple URLs

I have searched through a lot of similar questions, but I'm unable to resolve the issue with the code below.
I am trying to scrape the same information from 2 separate URLs.
There is no issue when I scrape 1 URL (first code). When I then attempt to loop through multiple URLs (second code), it throws this error:
ResultSet object has no attribute 'find_all'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Is it the case that the line where the error is returned (marked below) should not be included within the for loop? (I have tried this unsuccessfully.)
Could someone please explain why this is not working (my guess is that the structure is wrong in some way, but I've been unable to adjust it correctly), or whether this is in fact not the optimal method at all?
First code:
import csv
import pandas as pd
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq
import numpy as np
import re

url = "https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-royal-challengers-bangalore-42nd-match-734013/full-scorecard"
url_contents = ureq(url)  # opening the URL
soup = soup(url_contents, "html.parser")  # parse the html
batsmen = soup.find_all("table", {"class": ["table batsman"]})
bowlers = soup.find_all("table", {"class": ["table bowler"]})

for batsman in batsmen[0]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(batsmen)
        for x in batsman:
            rows = batsman.find_all('tr')[:-2]  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)

for bowler in bowlers[1]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(bowlers)
        for x in bowler:
            rows = bowler.find_all('tr')  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)
Second code:
import csv  # to do operations on CSV
import pandas as pd  # file operations
from bs4 import BeautifulSoup as soup  # scraping tool
from urllib.request import urlopen as ureq  # for requesting data from link
import numpy as np
import re

urls = ["https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-royal-challengers-bangalore-42nd-match-734013/full-scorecard",
        "https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-kolkata-knight-riders-21st-match-733971/full-scorecard"]

for url in urls:
    url_contents = ureq(url)  # opening the URL
    soup = soup(url_contents, "html.parser")  # parse the html
    batsmen = soup.find_all("table", {"class": ["table batsman"]})  # error here
    bowlers = soup.find_all("table", {"class": ["table bowler"]})

for batsman in batsmen[0]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(batsmen)
        for x in batsman:
            rows = batsman.find_all('tr')[:-2]  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)

for bowler in bowlers[1]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(bowlers)
        for x in bowler:
            rows = bowler.find_all('tr')  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)
Your problem is that you use the same name soup for the class/function soup(...) and for the result soup = ..., and you run it in a loop.
from bs4 import BeautifulSoup as soup

for url in urls:
    soup = soup(...)
In the first iteration everything works correctly, but the class/function soup() gets replaced by the result of soup = ..., and in the next iteration it tries to use that result as a class/function, which causes the problem.
In the first code you run soup = soup(...) only once, so there is no problem.
If you use different names - i.e. BeautifulSoup instead of soup - then it will work:
from bs4 import BeautifulSoup

for url in urls:
    soup = BeautifulSoup(...)
BTW:
In the second code you also have wrong indentation - you should run for batsman in ... and for bowler in ... inside for url in urls:, but you run them outside (after exiting the for url in urls: loop), so you will only get results for the last url. A sketch of the corrected structure is shown below.
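For illustration only (a sketch, not the original poster's exact code): the corrected structure, with BeautifulSoup imported under its own name so it is not shadowed, both table loops kept inside the url loop, and the row-writing logic condensed. It assumes the pages still serve these tables to plain urlopen.
import csv
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup

urls = ["https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-royal-challengers-bangalore-42nd-match-734013/full-scorecard",
        "https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-kolkata-knight-riders-21st-match-733971/full-scorecard"]

with open('testcsv3.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for url in urls:
        soup = BeautifulSoup(ureq(url), "html.parser")  # parser keeps its own name, no shadowing
        batsmen = soup.find_all("table", {"class": ["table batsman"]})
        bowlers = soup.find_all("table", {"class": ["table bowler"]})
        # both scraping loops stay INSIDE the url loop
        for table in batsmen + bowlers:
            for tr in table.find_all('tr'):
                data = [td.text.strip() for td in tr.find_all('td')]
                if data:
                    writer.writerow(data)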
You can use the requests library and try this:
import requests as req
from bs4 import BeautifulSoup

urls = ["https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-royal-challengers-bangalore-42nd-match-734013/full-scorecard",
        "https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-kolkata-knight-riders-21st-match-733971/full-scorecard"]

for url in urls:
    otp = req.get(url)
    if otp.ok:
        soup = BeautifulSoup(otp.text, 'lxml')
        batsmen = soup.find_all('table', {'class': 'table batsman'})
        bowlers = soup.find_all('table', {'class': 'table bowler'})
        for bat in batsmen:
            print(bat.find_all('td'))  # here you can use the find/find_all methods
        for bowl in bowlers:
            print(bowl.find_all('td'))  # here you can use the find/find_all methods

Scraping a table: IndexError: list index out of range

I am new to Python. I am using it in a Jupyter notebook to scrape a table from Wikipedia. All the code I wrote works, except when I want to put the information into a CSV file. The error that appears is "IndexError: list index out of range".
Here is the code:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'

import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

s = requests.Session()
response = s.get(url, timeout=10)
response

table_id = 'main'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.prettify().encode('UTF-8'))

table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr'):
    print(row)

table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    print(col[0].find('a').contents[0])
    print(col[1].string)  # name
    print(col[2].string)
    print(col[3].string)
    print(col[4].string)
    print(col[5].find(text=True))

csvfile = open('population.csv', 'w')
csvwriter = csv.writer(csvfile, delimiter=',')
headers = ('COUNTRY', 'CONTINENT', 'SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
csvwriter.writerow(headers)

table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    country = col[0].find('a').contents[0]
    continent = col[1].string
    subregion = col[2].string
    population_2018 = col[3].string
    population_2019 = col[4].string
    change = col[5].find(text=True)
    parsed_row = (country, continent, subregion, population_2018, population_2019, change)
    csvwriter.writerow(parsed_row)

csvfile.close()
Thank you very much!
I have a two-part answer: the easiest way to accomplish your task, and where the error is in your code.
Let pandas handle the requests, BeautifulSoup and csv for you.
import pandas as pd
URI = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
df = pd.read_html(URI)[3]
df.to_csv('population.csv', index=False)
pandas has .read_html, which returns a list of all the tables on the webpage. Your table was at index 3. With that, I saved it with .to_csv.
With .read_html, you can pass the attributes of a specific table e.g. attrs = {'id': 'table'}
# the table is now at index 0
df = pd.read_html(URI, attrs={'id':'main'})[0]
You can also specify the parser that will be used by BeautifulSoup that .read_html calls:
df = pd.read_html(URI, attrs={'id':'main'}, flavor='lxml')[0]
# 'lxml' is known for speed. But you can use `html.parser` if `lxml` or `html5lib` are not installed.
See the .read_html documentation for more details.
Update: Debugging Your Code
The error in your code comes from an empty col; adding an if condition solves the problem:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'

import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

s = requests.Session()
response = s.get(url, timeout=10)
response

table_id = 'main'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# print(soup.prettify().encode('UTF-8'))

csvfile = open('population.csv', 'w')
csvwriter = csv.writer(csvfile, delimiter=',')
headers = ('COUNTRY', 'CONTINENT', 'SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
csvwriter.writerow(headers)

table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    # this is all that was missing
    if col:
        country = col[0].find('a')['title']
        continent = col[1].string
        subregion = col[2].string
        population_2018 = col[3].string
        population_2019 = col[4].string
        change = col[5].find(text=True)
        parsed_row = (country, continent, subregion, population_2018, population_2019, change)
        csvwriter.writerow(parsed_row)

csvfile.close()
Prayson W. Daniel has already given the answer, and I offer another way.
import requests
from simplified_scrapy import SimplifiedDoc, utils, req

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
s = requests.Session()
res = s.get(url, timeout=10)

rows = []
headers = ('COUNTRY', 'CONTINENT', 'SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
rows.append(headers)

table_id = 'main'
doc = SimplifiedDoc(res.text)
table = doc.select('table#' + table_id)  # get the table by id
trs = table.tbody.children.children[1:]  # get all data rows
for tr in trs:
    row = [tr[0].a.text]     # first col: get the first link
    row.extend(tr.text[1:])  # remaining cols
    rows.append(row)

utils.save2csv('test_wiki.csv', rows)  # save data to csv

How to specify table for BeautifulSoup to find?

I'm trying to grab the table on this page https://nces.ed.gov/collegenavigator/?id=139755 under the Net Price expandable object. I've gone through tutorials for BS4, but I get so confused by the complexity of the html in this case that I can't figure out what syntax and which tags to use.
Here's a screenshot of the table and html I'm trying to get:
This is what I have so far. How do I add other tags to narrow down the results to just that one table?
import requests
from bs4 import BeautifulSoup
page = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = BeautifulSoup(page.text, 'html.parser')
soup = soup.find(id="divctl00_cphCollegeNavBody_ucInstitutionMain_ctl02")
print(soup.prettify())
Once I can parse that data, I will format it into a dataframe with pandas.
In this case I'd probably just use pandas to retrieve all the tables, then index in for the appropriate one:
import pandas as pd
table = pd.read_html('https://nces.ed.gov/collegenavigator/?id=139755')[10]
print(table)
If you are worried about the ordering changing in the future, you could loop over the tables returned by read_html and test for the presence of a unique string to identify the table, or use bs4's :has and :contains functionality (bs4 4.7.1+) to identify the right table and then pass it to read_html (or continue handling it with bs4). For example:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

r = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = bs(r.content, 'lxml')
table = pd.read_html(str(soup.select_one('table:has(td:contains("Average net price"))')))
print(table)
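And a minimal sketch of the other option mentioned above, looping over the tables that read_html returns and testing each one for a unique marker string (assuming here that "Average net price" only appears in the target table):
import pandas as pd

tables = pd.read_html('https://nces.ed.gov/collegenavigator/?id=139755')

# keep the first table that contains the marker string in any cell
target = None
for df in tables:
    if df.astype(str).apply(lambda col: col.str.contains('Average net price', regex=False)).any().any():
        target = df
        break

print(target)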
OK, maybe this can help you; I added pandas:
import requests
from bs4 import BeautifulSoup
import pandas as pd

page = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = BeautifulSoup(page.text, 'html.parser')
div = soup.find("div", {"id": "divctl00_cphCollegeNavBody_ucInstitutionMain_ctl02"})
table = div.findAll("table", {"class": "tabular"})[1]

l = []
table_rows = table.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    if td:
        row = [i.text for i in td]
        l.append(row)

df = pd.DataFrame(l, columns=["AVERAGE NET PRICE BY INCOME", "2015-2016", "2016-2017", "2017-2018"])
print(df)
Here is a basic script to scrape that first table in that accordion:
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "https://nces.ed.gov/collegenavigator/?id=139755#netprc"
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
parent_table = soup.find('div', attrs={'id':'netprc'})
desired_table = parent_table.find('table')
print(desired_table.prettify())
I assume you only want the values within the table, so I did an overkill version of this as well that combines the column names and values:
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = "https://nces.ed.gov/collegenavigator/?id=139755#netprc"
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')

parent_table = soup.find('div', attrs={'id': 'netprc'})
desired_table = parent_table.find('table')

header_row = desired_table.find_all('th')
headers = []
for header in header_row:
    header_text = header.get_text()
    headers.append(header_text)

money_values = []
data_row = desired_table.find_all('td')
for rows in data_row:
    row_text = rows.get_text()
    money_values.append(row_text)

for yrs, money in zip(headers, money_values):
    print(yrs, money)
This will print out the following:
Average net price
2015-2016 $13,340
2016-2017 $15,873
2017-2018 $16,950
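Since the question mentions eventually loading the data into a pandas DataFrame, one possible follow-up (not part of the original answer) is to zip the same headers and money_values lists into a one-row DataFrame and save it:
import pandas as pd

# Hypothetical follow-up: reuse the headers and money_values lists built above.
# Each header becomes a column; its matching value is the single row.
df = pd.DataFrame({yrs: [money] for yrs, money in zip(headers, money_values)})
print(df)
df.to_csv("net_price.csv", index=False)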

Beautiful Soup: Scrape Table Data

I'm looking to extract table data from the url below. Specifically I would like to extract the data in first column. When I run the code below, the data in the first column repeats multiple times. How can I get the values to show only once as it appears in the table?
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/page3.html').read()
soup = BeautifulSoup(html, 'lxml')
table = soup.find('table', {'id': 'giftList'})
rows = table.find_all('tr')

for row in rows:
    data = row.find_all('td')
    for cell in data:
        print(data[0].text)
Try this:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/page3.html').read()
soup = BeautifulSoup(html, 'lxml')
table = soup.find('table', {'id': 'giftList'})
rows = table.find_all('tr')

for row in rows:
    data = row.find_all('td')
    if len(data) > 0:
        cell = data[0]
        print(cell.text)
Using the requests module in combination with CSS selectors, you can also try the following:
import requests
from bs4 import BeautifulSoup

link = 'http://www.pythonscraping.com/pages/page3.html'
soup = BeautifulSoup(requests.get(link).text, 'lxml')

for row in soup.select('table#giftList tr')[1:]:
    cell = row.select_one('td').get_text(strip=True)
    print(cell)
Output:
Vegetable Basket
Russian Nesting Dolls
Fish Painting
Dead Parrot
Mystery Box

How can I scrape this table more efficiently?

Okay, I have built a program to scrape yahoo finance. I want the historical prices of a certain stock. I then want it to be written to an excel spreadsheet. It is doing everything the way it's supposed to, but it gives me ALL of the data on the whole page! I need just the data in the table. Thanks.
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import requests

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

playerdatasaved = ""
soup = make_soup("https://finance.yahoo.com/q/hp?s=USO+Historical+Prices")
for record in soup.findAll('tr'):
    playerdata = ""
    for data in record.findAll('td'):
        playerdata = playerdata + "," + data.text
    if len(playerdata) != 0:
        playerdatasaved = playerdatasaved + "\n" + playerdata[1:]

header = "Open,Close,High,Low"
file = open(os.path.expanduser("Uso.csv"), "wb")
file.write(bytes(header, encoding="ascii", errors='ignore'))
file.write(bytes(playerdatasaved, encoding="ascii", errors='ignore'))
print(playerdatasaved)
To get the table of data:
soup = make_soup("https://finance.yahoo.com/q/hp?s=USO+Historical+Prices")
table = [[cell.text for cell in row.findAll('td')] for row in soup.findAll('tr')]
To write out the table of data to a file:
import csv

with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(table)
