Scraping data from a table and storing it in a CSV file - Python

I want to scrape the data from this website and store it in a CSV file in this manner.
But when I try to scrape the data, it is not stored in the expected format. All the data ends up in the first column. I have no idea how to approach this problem.
Link: https://pce.ac.in/students/bachelors-students/
Code:
import csv  # file operations
from bs4 import BeautifulSoup as soup  # lib for pulling data from html/xml sites
from urllib.request import urlopen as uReq  # lib for sending and rec info over http

Url = 'https://pce.ac.in/students/bachelors-students/'
pageHtml = uReq(Url)
soup = soup(pageHtml, "html.parser")  # parse the html
table = soup.find_all("table", {"class": "tablepress tablepress-id-10 tablepress-responsive-phone"})

f = csv.writer(open('BEPillaiDepart.csv', 'w'))
f.writerow(['Choice Code', 'Course Name', 'Year of Establishment', 'Sanctioned Strength'])  # headers

for x in table:
    data = ""
    table_body = x.find('tbody')  # find tbody tag
    rows = table_body.find_all('tr')  # find all tr tags
    for tr in rows:
        cols = tr.find_all('td')  # find all td tags
        for td in cols:
            data = data + "\n" + td.text.strip()
        f.writerow([data])
        # print(data)

Create the data variable inside each tr loop; you can try it like this:
import csv  # file operations
from bs4 import BeautifulSoup as soup  # lib for pulling data from html/xml sites
from urllib.request import urlopen as uReq  # lib for sending and rec info over http

Url = 'https://pce.ac.in/students/bachelors-students/'
pageHtml = uReq(Url)
soup = soup(pageHtml, "html.parser")  # parse the html
table = soup.find_all("table", {"class": "tablepress tablepress-id-10 tablepress-responsive-phone"})

with open('BEPillaiDepart.csv', 'w', newline='') as csvfile:
    f = csv.writer(csvfile)
    f.writerow(['Choice Code', 'Course Name', 'Year of Establishment', 'Sanctioned Strength'])  # headers
    for x in table:
        table_body = x.find('tbody')  # find tbody tag
        rows = table_body.find_all('tr')  # find all tr tags
        for tr in rows:
            data = []
            cols = tr.find_all('td')  # find all td tags
            for td in cols:
                data.append(td.text.strip())
            f.writerow(data)
            print(data)

If you look up the meaning of CSV, you will find it stands for comma-separated values; however, I don't see any commas in the text you are appending to the file.
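In other words, give csv.writer one list of cell values per row and let it insert the commas itself; joining everything into a single newline-separated string, as in the question, puts the whole table into one field. A minimal sketch (the sample row is made up for illustration, not taken from the site):

import csv

# each inner list is one row; csv.writer inserts the commas between the cells
rows = [
    ['Choice Code', 'Course Name', 'Year of Establishment', 'Sanctioned Strength'],
    ['12345', 'Example Course', '2001', '60'],  # hypothetical values for illustration
]

with open('BEPillaiDepart.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(rows)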


How can I save scraped data from a soup object into CSV?

I am looking to save only the scraped data into a CSV file.
This is the scraped data and code:
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-
SkillsNetwork/labs/datasets/Programming_Languages.html"
from bs4 import BeautifulSoup
import requests
data = requests.get(url).text
soup = BeautifulSoup(data,"html5lib")
table = soup.find('table')
for row in table.find_all('tr'):
cols = row.find_all('td')
programing_language = cols[1].getText()
salary = cols[3].getText()
print("{}--->{}".format(programing_language,salary))
Here is the solution.
import pandas as pd
from bs4 import BeautifulSoup
import requests

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html"
# keep the page text in its own variable so it does not overwrite the data list below
html = requests.get(url).text
soup = BeautifulSoup(html, "html5lib")
table = soup.find('table')

data = []
for row in table.find_all('tr'):
    cols = row.find_all('td')
    programing_language = cols[1].getText()
    salary = cols[3].getText()
    data.append([programing_language, salary])
    # print("{}--->{}".format(programing_language, salary))

cols = ['programing_language', 'salary']
df = pd.DataFrame(data, columns=cols)
df.to_csv("data.csv", index=False)
For a lightweight solution you can just use csv. Ignore the header row by using tr:nth-child(n+2); this nth-child range selector matches from the second tr onward. Then, within a loop over those rows, select the second and fourth columns as follows:
from bs4 import BeautifulSoup as bs
import requests, csv

response = requests.get('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html',
                        headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(response.content, 'lxml')

with open("programming.csv", "w", encoding="utf-8-sig", newline='') as f:
    w = csv.writer(f, delimiter=",", quoting=csv.QUOTE_MINIMAL)
    w.writerow(["Language", "Average Annual Salary"])
    for item in soup.select('tr:nth-child(n+2)'):
        w.writerow([item.select_one('td:nth-child(2)').text,
                    item.select_one('td:nth-child(4)').text])

Extract tables from HTML to CSV in Python with BS

I have a huge HTML file which contains many tables, and I want to save them as CSV files.
I have code to export all tables from an HTML file, but when I run it, nothing happens! Is there any way to troubleshoot this? I am using BS and pandas.
It just opens the file and looks for the tables.
import os
import sys
import pandas as pd
from bs4 import BeautifulSoup as bs

path = 'index.html'
soup = bs(open(path), 'html.parser')

def get_all_tables(soup):
    """Extracts and returns all tables in a soup object"""
    return soup.find_all("table")

def get_table_headers(table):
    """Given a table soup, returns all the headers"""
    headers = []
    for th in table.find("tr").find_all("th"):
        headers.append(th.text.strip())
    return headers

def get_table_rows(table):
    """Given a table, returns all its rows"""
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # if no td tags, search for th tags
            # can be found especially in wikipedia tables below the table
            ths = tr.find_all("th")
            for th in ths:
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for td in tds:
                cells.append(td.text.strip())
        rows.append(cells)
    return rows

def save_as_csv(table_name, headers, rows):
    pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")

def main(path):
    # get the soup
    soup = get_soup(path)
    # extract all the tables from the file
    tables = get_all_tables(soup)
    print(f"[+] Found a total of {len(tables)} tables.")
    # iterate over all tables
    for i, table in enumerate(tables, start=1):
        # get the table headers
        headers = get_table_headers(table)
        # get all the rows of the table
        rows = get_table_rows(table)
        # save table as csv file
        table_name = f"table-{i}"
        print(f"[+] Saving {table_name}")
        save_as_csv(table_name, headers, rows)
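One likely reason nothing happens is that the script only defines functions: main() is never called, and get_soup() is never defined (the soup built at the top of the file is never passed into main). A minimal sketch of the missing pieces, assuming the rest of the code stays as posted:

def get_soup(path):
    # build the soup from the local HTML file
    with open(path) as f:
        return bs(f, 'html.parser')

if __name__ == "__main__":
    main(path)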

Using Beautiful Soup on multiple URLs

I have searched through a lot of similar questions, but I'm unable to resolve the issue with the code below.
I am trying to scrape the same information from 2 separate URLs.
There is no issue when I scrape 1 URL (code 1). I then attempt to for loop through multiple URLs (code 2) and it throws this error:
ResultSet object has no attribute 'find_all'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Is it the case that the line where the error is returned (highlighted below) should not be included within the for loop? (I have tried this unsuccessfully.)
Could someone please explain why this is not working (my guess would be that the structure is wrong in some way, but I've been unable to adjust it correctly), or whether this is in fact not the optimal method at all?
First code:
import csv
import pandas as pd
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq
import numpy as np
import re

url = "https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-royal-challengers-bangalore-42nd-match-734013/full-scorecard"

url_contents = ureq(url)  # opening the URL
soup = soup(url_contents, "html.parser")  # parse the html
batsmen = soup.find_all("table", {"class": ["table batsman"]})
bowlers = soup.find_all("table", {"class": ["table bowler"]})

for batsman in batsmen[0]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(batsmen)
        for x in batsman:
            rows = batsman.find_all('tr')[:-2]  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)

for bowler in bowlers[1]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(bowlers)
        for x in bowler:
            rows = bowler.find_all('tr')  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)
Second code:
import csv  # to do operations on CSV
import pandas as pd  # file operations
from bs4 import BeautifulSoup as soup  # scraping tool
from urllib.request import urlopen as ureq  # for requesting data from link
import numpy as np
import re

urls = ["https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-royal-challengers-bangalore-42nd-match-734013/full-scorecard",
        "https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-kolkata-knight-riders-21st-match-733971/full-scorecard"]

for url in urls:
    url_contents = ureq(url)  # opening the URL
    soup = soup(url_contents, "html.parser")  # parse the html
    **batsmen = soup.find_all("table", { "class":["table batsman"]})**  # error here
    bowlers = soup.find_all("table", {"class": ["table bowler"]})

for batsman in batsmen[0]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(batsmen)
        for x in batsman:
            rows = batsman.find_all('tr')[:-2]  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)

for bowler in bowlers[1]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(bowlers)
        for x in bowler:
            rows = bowler.find_all('tr')  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)
Your problem comes from using the same name soup both for the class/function soup(...) and for the result soup = ..., and running it in a loop:
from bs4 import BeautifulSoup as soup

for url in urls:
    soup = soup(...)
In the first iteration everything works correctly, but the class/function soup() is replaced by the result soup = ..., and in the next iteration it tries to use that result as a class/function, which causes the problem.
In the first code you run soup = soup() only once, so it causes no problem.
If you use different names, i.e. BeautifulSoup instead of soup, then it will work:
from bs4 import BeautifulSoup

for url in urls:
    soup = BeautifulSoup(...)
BTW: in the second code you have wrong indentation - you should run for batsman in ... and for bowler in ... inside for url in urls:, but you run them outside (after exiting the for url in urls: loop), so you will only get results for the last url.
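A simplified sketch of the corrected nesting (the scorecard-specific filtering such as [:-2] is omitted for brevity; the point is that batsmen and bowlers are processed while still inside the url loop):

from bs4 import BeautifulSoup
from urllib.request import urlopen as ureq
import csv

urls = [...]  # the two scorecard URLs from the question

with open('testcsv3.csv', 'a', newline='') as csvfile:
    f = csv.writer(csvfile)
    for url in urls:
        page = BeautifulSoup(ureq(url), "html.parser")
        batsmen = page.find_all("table", {"class": ["table batsman"]})
        bowlers = page.find_all("table", {"class": ["table bowler"]})
        for table in batsmen + bowlers:
            for tr in table.find_all('tr'):
                data = [td.text.strip() for td in tr.find_all('td')]
                if data:  # skip rows with no td cells (e.g. header rows)
                    f.writerow(data)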
You can use the requests lib and try this:
import requests as req
from bs4 import BeautifulSoup

urls = ["https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-royal-challengers-bangalore-42nd-match-734013/full-scorecard",
        "https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-kolkata-knight-riders-21st-match-733971/full-scorecard"]

for url in urls:
    otp = req.get(url)
    if otp.ok:
        soup = BeautifulSoup(otp.text, 'lxml')
        batsmen = soup.find_all('table', {'class': 'table batsman'})
        bowlers = soup.find_all('table', {'class': 'table bowler'})
        for bat in batsmen:
            print(bat.find_all('td'))  # here you can use the find/find_all methods
        for bowl in bowlers:
            print(bowl.find_all('td'))  # here you can use the find/find_all methods

Need help in scraping information from multiple webpages and import to csv file in tabular form - Python

I have been working on web scraping the infobox information from Wikipedia. This is the code I have been using:
import requests
import csv
from bs4 import BeautifulSoup

URL = ['https://en.wikipedia.org/wiki/Workers_Credit_Union','https://en.wikipedia.org/wiki/San_Diego_County_Credit_Union',
       'https://en.wikipedia.org/wiki/USA_Federal_Credit_Union','https://en.wikipedia.org/wiki/Commonwealth_Credit_Union',
       'https://en.wikipedia.org/wiki/Center_for_Community_Self-Help','https://en.wikipedia.org/wiki/ESL_Federal_Credit_Union',
       'https://en.wikipedia.org/wiki/State_Employees_Credit_Union','https://en.wikipedia.org/wiki/United_Heritage_Credit_Union']

for url in URL:
    headers = []
    rows = []
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', class_='infobox')
    credit_union_name = soup.find('h1', id="firstHeading")
    header_tags = table.find_all('th')
    headers = [header.text.strip() for header in header_tags]
    data_rows = table.find_all('tr')
    for row in data_rows:
        value = row.find_all('td')
        beautified_value = [dp.text.strip() for dp in value]
        if len(beautified_value) == 0:
            continue
        rows.append(beautified_value)
    rows.append("")
    rows.append([credit_union_name.text.strip()])
    rows.append([url])
    with open(r'credit_unions.csv', 'a+', newline="") as output:
        writer = csv.writer(output)
        writer.writerow(headers)
        writer.writerow(rows)
However, when I check the CSV file, the information is not presented in tabular form. The scraped elements are stored in nested lists instead of a single list. I need the scraped information for each URL to be stored in a single list and written to the CSV file in tabular form with the headings. Need help regarding this.
The infoboxes have different structures and labels. So I think the best way to solve this is to use dicts and a DictWriter.
import requests
import csv
from bs4 import BeautifulSoup

URL = ['https://en.wikipedia.org/wiki/Workers_Credit_Union',
       'https://en.wikipedia.org/wiki/San_Diego_County_Credit_Union',
       'https://en.wikipedia.org/wiki/USA_Federal_Credit_Union',
       'https://en.wikipedia.org/wiki/Commonwealth_Credit_Union',
       'https://en.wikipedia.org/wiki/Center_for_Community_Self-Help',
       'https://en.wikipedia.org/wiki/ESL_Federal_Credit_Union',
       'https://en.wikipedia.org/wiki/State_Employees_Credit_Union',
       'https://en.wikipedia.org/wiki/United_Heritage_Credit_Union']

csv_headers = set()
csv_rows = []

for url in URL:
    csv_row = {}
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    credit_union_name = soup.find('h1', id="firstHeading")
    table = soup.find('table', class_='infobox')
    data_rows = table.find_all('tr')
    for data_row in data_rows:
        label = data_row.find('th')
        value = data_row.find('td')
        if label is None or value is None:
            continue
        beautified_label = label.text.strip()
        beautified_value = value.text.strip()
        csv_row[beautified_label] = beautified_value
        csv_headers.add(beautified_label)
    csv_row["name"] = credit_union_name.text.strip()
    csv_row["url"] = url
    csv_rows.append(csv_row)

with open(r'credit_unions.csv', 'a+', newline="") as output:
    headers = ["name", "url"]
    headers += sorted(csv_headers)
    writer = csv.DictWriter(output, fieldnames=headers)
    writer.writeheader()
    writer.writerows(csv_rows)
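A useful property of DictWriter here: any infobox label that a particular page lacks is simply written as an empty cell (restval defaults to an empty string), so every row lines up under the union of all headers collected in csv_headers.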

Python Web Scraping - How to scrape this type of site?

Okay, so I need to scrape the following webpage: https://www.programmableweb.com/category/all/apis?deadpool=1
It's a list of APIs. There are approx 22,000 APIs to scrape.
I need to:
1) Get the URL of each API in the table (pages 1-889), and also scrape the following info:
API name
Description
Category
Submitted
2) I then need to scrape a bunch of information from each URL.
3) Export the data to a CSV
The thing is, I'm a bit lost on how to think about this project. From what I can see, there are no AJAX calls being made to populate the table, which means I'm going to have to parse the HTML directly (right?)
In my head, the logic would be something like this:
Use the requests & BS4 libraries to scrape the table
Then, somehow grab the HREF from every row
Access that HREF, scrape the data, move onto the next one
Rinse and repeat for all table rows.
Am I on the right track? Is this possible with requests & BS4?
Here are some screenshots of what I've been trying to explain.
Thank you SOO much for any help. This is hurting my head haha
Here we go using requests, BeautifulSoup and pandas:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.programmableweb.com/category/all/apis?deadpool=1&page='
num = int(input('How Many Pages to Parse?> '))
print('please wait....')

name = []
desc = []
cat = []
sub = []

for i in range(0, num):
    r = requests.get(f"{url}{i}")
    soup = BeautifulSoup(r.text, 'html.parser')
    for item1 in soup.findAll('td', attrs={'class': 'views-field views-field-title col-md-3'}):
        name.append(item1.text)
    for item2 in soup.findAll('td', attrs={'class': 'views-field views-field-search-api-excerpt views-field-field-api-description hidden-xs visible-md visible-sm col-md-8'}):
        desc.append(item2.text)
    for item3 in soup.findAll('td', attrs={'class': 'views-field views-field-field-article-primary-category'}):
        cat.append(item3.text)
    for item4 in soup.findAll('td', attrs={'class': 'views-field views-field-created'}):
        sub.append(item4.text)

result = []
for item in zip(name, desc, cat, sub):
    result.append(item)

df = pd.DataFrame(
    result, columns=['API Name', 'Description', 'Category', 'Submitted'])
df.to_csv('output.csv')
print('Task Completed, Result saved to output.csv file.')
Now for href parsing:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.programmableweb.com/category/all/apis?deadpool=0&page='
num = int(input('How Many Pages to Parse?> '))
print('please wait....')

links = []
for i in range(0, num):
    r = requests.get(f"{url}{i}")
    soup = BeautifulSoup(r.text, 'html.parser')
    for link in soup.findAll('td', attrs={'class': 'views-field views-field-title col-md-3'}):
        for href in link.findAll('a'):
            result = 'https://www.programmableweb.com' + href.get('href')
            links.append(result)

spans = []
for link in links:
    r = requests.get(link)
    soup = BeautifulSoup(r.text, 'html.parser')
    span = [span.text for span in soup.select('div.field span')]
    spans.append(span)

data = []
for item in spans:
    data.append(item)

df = pd.DataFrame(data)
df.to_csv('data.csv')
print('Task Completed, Result saved to data.csv file.')
In case you want those 2 csv files merged together, here's the code:
import pandas as pd
a = pd.read_csv("output.csv")
b = pd.read_csv("data.csv")
merged = a.merge(b)
merged.to_csv("final.csv", index=False)
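A quick note: DataFrame.merge with no on= argument joins on whichever columns the two frames share. Here that is the unnamed index column both CSVs carry (they were written without index=False), so rows are paired by their position in each file; double-check that both files were scraped over the same pages before relying on the merged output.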
You should read more about scraping if you are going to pursue it.
from bs4 import BeautifulSoup
import csv, os, requests
from urllib import parse

def SaveAsCsv(list_of_rows):
    try:
        with open('data.csv', mode='a', newline='', encoding='utf-8') as outfile:
            csv.writer(outfile).writerow(list_of_rows)
    except PermissionError:
        print("Please make sure data.csv is closed\n")

if os.path.isfile('data.csv') and os.access('data.csv', os.R_OK):
    print("File data.csv already exists\n")
else:
    SaveAsCsv(['api_name', 'api_link', 'api_desc', 'api_cat'])

BaseUrl = 'https://www.programmableweb.com/category/all/apis?deadpool=1&page={}'
for i in range(1, 890):
    print('## Getting Page {} out of 889'.format(i))
    url = BaseUrl.format(i)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    table_rows = soup.select('div.view-content > table[class="views-table cols-4 table"] > tbody tr')
    for row in table_rows:
        tds = row.select('td')
        api_name = tds[0].text.strip()
        api_link = parse.urljoin(url, tds[0].find('a').get('href'))
        api_desc = tds[1].text.strip()
        api_cat = tds[2].text.strip() if len(tds) >= 3 else ''
        SaveAsCsv([api_name, api_link, api_desc, api_cat])
