Extract table from CSV in Python with BS - python

I have a huge HTML file which contains many tables and I want to save it as CSV file.
I have a code to export all tables from an html file but when I run it, nothing happens! Is there anyway to troubleshoot this? I am using BS and panda.
It just opens the file and looks for the table.
import os
import sys
import pandas as pd
from bs4 import BeautifulSoup as bs
path = 'index.html'
soup = bs(open(path),'html.parser')
def get_all_tables(soup):
"""Extracts and returns all tables in a soup object"""
return soup.find_all("table")
def get_table_headers(table):
"""Given a table soup, returns all the headers"""
headers = []
for th in table.find("tr").find_all("th"):
headers.append(th.text.strip())
return headers
def get_table_rows(table):
"""Given a table, returns all its rows"""
rows = []
for tr in table.find_all("tr")[1:]:
cells = []
# grab all td tags in this table row
tds = tr.find_all("td")
if len(tds) == 0:
# if no td tags, search for th tags
# can be found especially in wikipedia tables below the table
ths = tr.find_all("th")
for th in ths:
cells.append(th.text.strip())
else:
# use regular td tags
for td in tds:
cells.append(td.text.strip())
rows.append(cells)
return rows
def save_as_csv(table_name, headers, rows):
pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")
def main(path):
# get the soup
soup = get_soup(path)
# extract all the tables from the file
tables = get_all_tables(soup)
print(f"[+] Found a total of {len(tables)} tables.")
# iterate over all tables
for i, table in enumerate(tables, start=1):
# get the table headers
headers = get_table_headers(table)
# get all the rows of the table
rows = get_table_rows(table)
# save table as csv file
table_name = f"table-{i}"
print(f"[+] Saving {table_name}")
save_as_csv(table_name, headers, rows)

Related

Beautiful Soup to Scrape Data from Static Webpages

I am trying to values from a table of multiple static webpages. It is the verb conjugation data for Korean verbs here: https://koreanverb.app/
My Python script uses Beautiful Soup. The goal is to grab all conjugations from multiple URL inputs and output the data to a CSV file.
Conjugations are stored on the page in table with class "table-responsive" and under the table rows with class "conjugation-row". There are multiple "conjugation-row" table rows on each page. My script is someone only grabbing the first table row with class "conjugation-row".
Why isn't the for loop grabbing all the td elements with class "conjugation-row"? I would appreciate a solution that grabs all tr with class "conjugation-row". I tried using job_elements = results.find("tr", class_="conjugation-row"), but I get the following error:
AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Furthermore, when I do get the data and output to a CSV file, the data is in separate rows as expected, but leaves empty spaces., It places the data rows for the second URL at the index after all data rows for the first URL. See example output here:
See code here:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
# create csv file
outfile = open("scrape.csv","w",newline='')
writer = csv.writer(outfile)
## define first URL to grab conjugation names
url1 = 'https://koreanverb.app/?search=%ED%95%98%EB%8B%A4'
# define dataframe columns
df = pd.DataFrame(columns=['conjugation name'])
# get URL content
response = requests.get(url1)
soup = BeautifulSoup(response.content, 'html.parser')
# get table with all verb conjugations
results = soup.find("div", class_="table-responsive")
##### GET CONJUGATIONS AND APPEND TO CSV
# define URLs
urls = ['https://koreanverb.app/?search=%ED%95%98%EB%8B%A4',
'https://koreanverb.app/?search=%EB%A8%B9%EB%8B%A4',
'https://koreanverb.app/?search=%EB%A7%88%EC%8B%9C%EB%8B%A4']
# loop to get data
for url in urls:
response = requests.get(url)
soup2 = BeautifulSoup(response.content, 'html.parser')
# get table with all verb conjugations
results2 = soup2.find("div", class_="table-responsive")
# get dictionary form of verb/adjective
verb_results = soup2.find('dl', class_='dl-horizontal')
verb_title = verb_results.find('dd')
verb_title_text = verb_title.text
job_elements = results2.find_all("tr", class_="conjugation-row")
for job_element in job_elements:
conjugation_name = job_element.find("td", class_="conjugation-name")
conjugation_korean = conjugation_name.find_next_sibling("td")
conjugation_name_text = conjugation_name.text
conjugation_korean_text = conjugation_korean.text
data_column = pd.DataFrame({ 'conjugation name': [conjugation_name_text],
verb_title_text: [conjugation_korean_text],
})
#data_column = pd.DataFrame({verb_title_text: [conjugation_korean_text]})
df = df.append(data_column, ignore_index = True)
# save to csv
df.to_csv('scrape.csv')
outfile.close()
print('Verb Conjugations Collected and Appended to CSV, one per column')
Get all the job_elements using find_all() since find() only returns the first occurrence and iterate over them in a for loop like below.
job_elements = results.find_all("tr", class_="conjugation-row")
for job_element in job_elements:
conjugation_name = job_element.find("td", class_="conjugation-name")
conjugation_korean = conjugation_name.find_next_sibling("td")
conjugation_name_text = conjugation_name.text
conjugation_korean_text = conjugation_korean.text
# append element to data
df2 = pd.DataFrame([[conjugation_name_text,conjugation_korean_text]],columns=['conjugation_name','conjugation_korean'])
df = df.append(df2)
The error is where you are trying to use find() on a variable of type list.
As your script is growing big, I made some modifications like using get_conjugations() function and some proper names that are easy to understand. Firstly, conjugation_names and conjugation_korean_names are added into pandas Dataframe columns and then other columns are added subsequently (korean0, korean1 ...).
import requests
from bs4 import BeautifulSoup
import pandas as pd
# function to parse the html data & get conjugations
def get_conjugations(url):
#set return lists
conjugation_names = []
conjugation_korean_names = []
#get html text
html = requests.get(url).text
#parse the html text
soup = BeautifulSoup(html, 'html.parser')
#get table
table = soup.find("div", class_="table-responsive")
table_rows = table.find_all("tr", class_="conjugation-row")
for row in table_rows:
conjugation_name = row.find("td", class_="conjugation-name")
conjugation_korean = conjugation_name.find_next_sibling("td")
conjugation_names.append(conjugation_name.text)
conjugation_korean_names.append(conjugation_korean.text)
#return both lists
return conjugation_names, conjugation_korean_names
# create csv file
outfile = open("scrape.csv", "w", newline='')
urls = ['https://koreanverb.app/?search=%ED%95%98%EB%8B%A4',
'https://koreanverb.app/?search=%EB%A8%B9%EB%8B%A4',
'https://koreanverb.app/?search=%EB%A7%88%EC%8B%9C%EB%8B%A4']
# define dataframe columns
df = pd.DataFrame(columns=['conjugation_name', 'conjugation_korean', 'korean0', 'korean1'])
conjugation_names, conjugation_korean_names = get_conjugations(urls[0])
df['conjugation_name'] = conjugation_names
df['conjugation_korean'] = conjugation_korean_names
for index, url in enumerate(urls[1:]):
conjugation_names, conjugation_korean_names = get_conjugations(url)
#set column name
column_name = 'korean' + str(index)
df[column_name] = conjugation_korean_names
#save to csv
df.to_csv('scrape.csv')
outfile.close()
# Print DONE
print('Export to CSV Complete')
Output:
,conjugation_name,conjugation_korean,korean0,korean1
0,declarative present informal low,해,먹어,마셔
1,declarative present informal high,해요,먹어요,마셔요
2,declarative present formal low,한다,먹는다,마신다
3,declarative present formal high,합니다,먹습니다,마십니다
...
Note:
This assumes that elements in different URLs are in same order.

Python BeautifulSoup and Pandas extract table from list of urls and save all the tables into single dataframe or save as csv

I am trying to extract tabular data from a list of urls and I want to save all the table into a single csv file.
I am new and relatively beginner in python and from non-CS background, however I am very eager to learn the same.
import pandas as pd
import urllib.request
import bs4 as bs
urls = ['A', 'B','C','D',...'Z']
for url in urls:
source = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(source,'lxml')
table = soup.find('table', class_='tbldata14 bdrtpg')
table_rows = table.find_all('tr')
data = []
for tr in table_rows:
td = tr.find_all('td')
row = [tr.text for tr in td]
data.append(row)
final_table = pd.DataFrame(data, columns=["ABC", "XYZ",...])
final_table.to_csv (r'F:\Projects\McData.csv', index = False, header=True)
What I Get from above code in newly created csv file is -
ABC XYZ PQR MNL CYP ZXS
1 2 3 4 5 6
My above code only gets table from last url- 'Z', which, as I have checked is actually the table from last url in list.
What I am trying to achieve here is getting all tables from list of urls - i.e. A to Z into single csv file.
This is an issue with indentation and order. table_rows gets reset every time through the for url in urls loop, so you only end up with the last URLs worth of data. If you want all of the URLs worth of data in one final CSV, see the changes I made below.
import pandas as pd
import urllib.request
import bs4 as bs
urls = ['A', 'B','C','D',...'Z']
data = [] # Moved to the start
for url in urls:
source = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(source,'lxml')
table = soup.find('table', class_='tbldata14 bdrtpg')
table_rows = table.find_all('tr')
#indented the following loop so it runs with every URL data
for tr in table_rows:
td = tr.find_all('td')
row = [tr.text for tr in td]
data.append(row)
final_table = pd.DataFrame(data, columns=["ABC", "XYZ",...])
final_table.to_csv (r'F:\Projects\McData.csv', index = False, header=True)

Python Web Scraping: Output to csv

I'm doing some progress with web scraping however I still need some help to perform some operations:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
# soup = BeautifulSoup(requests.get(converturl).content, 'html.parser')
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
out = []
for tr in soup.select('.col-md-4 tbody tr'):
On the class col-md-4 I know there are 3 tables I want to generate a csv which as an output has three values: first name, last name, and for the last value I want the header name of the table.
first name, last name, header table
Any help would be appreciated.
This is what I have done on my own:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
filename = url.rsplit('/', 1)[1] + '.csv'
tables = soup.select('.col-md-4 table')
rows = []
for tr in tables:
t = tr.get_text(strip=True, separator='|').split('|')
rows.append(t)
df = pd.DataFrame(rows)
print(df)
df.to_csv(filename)
Thanks,
This might work:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
tables = soup.select('.col-md-4 table')
rows = []
for table in tables:
cleaned = list(table.stripped_strings)
header, names = cleaned[0], cleaned[1:]
data = [name.split(', ') + [header] for name in names]
rows.extend(data)
result = pd.DataFrame.from_records(rows, columns=['surname', 'name', 'table'])
You need to first iterate through each table you want to scrape, then for each table, get its header and rows of data. For each row of data, you want to parse out the First Name and Last Name (along with the header of the table).
Here's a verbose working example:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
out = []
# Iterate through each of the three tables
for table in soup.select(".col-md-4 table"):
# Grab the header and rows from the table
header = table.select("thead th")[0].text.strip()
rows = [s.text.strip() for s in table.select("tbody tr")]
t = [] # This list will contain the rows of data for this table
# Iterate through rows in this table
for row in rows:
# Split by comma (last_name, first_name)
split = row.split(",")
last_name = split[0].strip()
first_name = split[1].strip()
# Create the row of data
t.append([first_name, last_name, header])
# Convert list of rows to a DataFrame
df = pd.DataFrame(t, columns=["first_name", "last_name", "table_name"])
# Append to list of DataFrames
out.append(df)
# Write to CSVs...
out[0].to_csv("first_table.csv", index=None) # etc...
Whenever you're web scraping, I highly recommend using strip() on all of the text you parse to make sure you don't have superfluous spaces in your data.
I hope this helps!

Scraping data from a table and storing it in csv file

I would want to scrap the data from this website and store it in csv file in this manner.
But when I try to scrap the data it is not stored in exact format. All the data is stored in the 1st column itself. I have no idea how to approach this problem.
Link : https://pce.ac.in/students/bachelors-students/
Code:
import csv # file operations
from bs4 import BeautifulSoup as soup # lib for pulling data from html/xmlsites
from urllib.request import urlopen as uReq # lib for sending and rec info over http
Url = 'https://pce.ac.in/students/bachelors-students/'
pageHtml = uReq(Url)
soup = soup(pageHtml,"html.parser") #parse the html
table = soup.find_all("table", { "class" : "tablepress tablepress-id-10 tablepress-responsive-phone" })
f = csv.writer(open('BEPillaiDepart.csv', 'w'))
f.writerow(['Choice Code', 'Course Name', 'Year of Establishment','Sanctioned Strength']) # headers
for x in table:
data=""
table_body = x.find('tbody') #find tbody tag
rows = table_body.find_all('tr') #find all tr tag
for tr in rows:
cols = tr.find_all('td') #find all td tags
for td in cols:
data=data+ "\n"+ td.text.strip()
f.writerow([data])
#print(data)
Create variable data in each tr label,you can try like this:
import csv # file operations
from bs4 import BeautifulSoup as soup # lib for pulling data from html/xmlsites
from urllib.request import urlopen as uReq # lib for sending and rec info over http
Url = 'https://pce.ac.in/students/bachelors-students/'
pageHtml = uReq(Url)
soup = soup(pageHtml,"html.parser") #parse the html
table = soup.find_all("table", { "class" : "tablepress tablepress-id-10 tablepress-responsive-phone" })
with open('BEPillaiDepart.csv', 'w',newline='') as csvfile:
f = csv.writer(csvfile)
f.writerow(['Choice Code', 'Course Name', 'Year of Establishment','Sanctioned Strength']) # headers
for x in table:
table_body = x.find('tbody') #find tbody tag
rows = table_body.find_all('tr') #find all tr tag
for tr in rows:
data=[]
cols = tr.find_all('td') #find all td tags
for td in cols:
data.append(td.text.strip())
f.writerow(data)
print(data)
If you search the meaning of csv, you would find it means comma separated values, however I don't see any commas in your text while appending it to the file.

Python Data Scraper

I wrote the following line of code
#!/usr/bin/python
#weather.scraper
from bs4 import BeautifulSoup
import urllib
def main():
"""weather scraper"""
r = urllib.urlopen("https://www.wunderground.com/history/airport/KPHL/2016/1/1/MonthlyHistory.html?&reqdb.zip=&reqdb.magic=&reqdb.wmo=&MR=1").read()
soup = BeautifulSoup(r, "html.parser")
table = soup.find_all("table", class_="responsive airport-history-summary-table")
tr = soup.find_all("tr")
td = soup.find_all("td")
print table
if __name__ == "__main__":
main()
When I print the table i get all the html (td, tr, span, etc.) as well. How can I print the content of the table (tr, td) without the html?
THANKS!
You have to use .getText() method when you want to get a content. Since find_all returns a list of elements, you have to choose one of them (td[0]).
Or you can do for example:
for tr in soup.find_all("tr"):
print '>>>> NEW row <<<<'
print '|'.join([x.getText() for x in tr.find_all('td')])
The loop above prints for each row cell next to cell.
Note that you do find all td's and all tr's your way but you probably want to get just those in table.
If you want to look for elements inside the table, you have to do this:
table.find('tr') instead of soup.find('tr) so the BeautifulSoup will be looking for trs in the table instead of whole html.
YOUR CODE MODIFIED (according to your comment that there are more tables):
#!/usr/bin/python
#weather.scraper
from bs4 import BeautifulSoup
import urllib
def main():
"""weather scraper"""
r = urllib.urlopen("https://www.wunderground.com/history/airport/KPHL/2016/1/1/MonthlyHistory.html?&reqdb.zip=&reqdb.magic=&reqdb.wmo=&MR=1").read()
soup = BeautifulSoup(r, "html.parser")
tables = soup.find_all("table")
for table in tables:
print '>>>>>>> NEW TABLE <<<<<<<<<'
trs = table.find_all("tr")
for tr in trs:
# for each row of current table, write it using | between cells
print '|'.join([x.get_text().replace('\n','') for x in tr.find_all('td')])
if __name__ == "__main__":
main()

Categories

Resources