I'm trying to scrape a table on the markets.ft website which unfortunately has a number of icons in it (table: 'Lipper Leader Scorecard' - https://markets.ft.com/data/funds/tearsheet/ratings?s=LU0526609390:EUR).
When I use BeautifulSoup, I can grab the table but all the values are NaN.
Is there a way to scrape the icons inside the table and convert them into numerical values?
My code is:
import requests
import pandas as pd
from bs4 import BeautifulSoup
id_list = ['LU0526609390:EUR','IE00BHBX0Z19:EUR', 'LU1076093779:EUR', 'LU1116896363:EUR', 'LU1116896876:EUR']
urls = ['https://markets.ft.com/data/funds/tearsheet/ratings?s='+ x for x in id_list]
dfs =[]
for url in urls:
    r = requests.get(url).content
    soup = BeautifulSoup(r, 'html.parser')
    # Some funds in the list do not have any data.
    try:
        table = soup.find_all('table')[0]
        print(table)
    except Exception:
        continue
    df = pd.read_html(str(table), index_col=0)[0]
    dfs.append(df)
print(dfs)
Required Output for fund (LU0526609390):
                Total return  Consistent return  Preservation  Expense
Overall rating             3                  3             5        5
3 year rating              3                  3             5        5
5 year rating              2                  3             5        5
You can use a dictionary to map the icon's class value to the corresponding integer:
import requests, bs4
import pandas as pd
from io import StringIO
options = {
    'mod-sprite-lipper-1': 1,
    'mod-sprite-lipper-2': 2,
    'mod-sprite-lipper-3': 3,
    'mod-sprite-lipper-4': 4,
    'mod-sprite-lipper-5': 5,
}

soup = bs4.BeautifulSoup(requests.get(
    url='https://markets.ft.com/data/funds/tearsheet/ratings?s=LU0526609390:EUR'
).content, 'html.parser').find('table', {'class': 'mod-ui-table'})

header = [x.text.strip() for x in soup.find('thead').find_all('th')]
data = [header] + [
    [x.find('td').text.strip()] + [
        options[e.find('i').get('class')[-1]]
        for e in x.find_all('td')[1:]
    ]
    for x in soup.find('tbody').find_all('tr')
]

df = pd.read_csv(
    StringIO('\n'.join([','.join(str(x) for x in xs) for xs in data])),
    index_col=0,
)
print(df)
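If you want to apply the same class-to-integer mapping across the whole list of funds from the question, a minimal sketch (reusing the imports and the options dictionary above; funds without a ratings table are skipped) could look like this:

id_list = ['LU0526609390:EUR', 'IE00BHBX0Z19:EUR', 'LU1076093779:EUR',
           'LU1116896363:EUR', 'LU1116896876:EUR']

dfs = []
for fund_id in id_list:
    html = requests.get('https://markets.ft.com/data/funds/tearsheet/ratings?s=' + fund_id).content
    table = bs4.BeautifulSoup(html, 'html.parser').find('table', {'class': 'mod-ui-table'})
    if table is None:  # some funds in the list do not have any data
        continue
    header = [th.text.strip() for th in table.find('thead').find_all('th')]
    data = [header] + [
        [row.find('td').text.strip()] + [
            options[cell.find('i').get('class')[-1]]
            for cell in row.find_all('td')[1:]
        ]
        for row in table.find('tbody').find_all('tr')
    ]
    dfs.append(pd.read_csv(
        StringIO('\n'.join(','.join(str(x) for x in xs) for xs in data)),
        index_col=0,
    ))

print(dfs)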
I'm trying to parse this webpage into a pandas dataframe to analyze, but the page is set up such that the table only has two columns of use, one with the name and the other containing all the other information as a single cell.
For example, with my code below:
import bs4
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
url = "https://education.scripps.edu/alumni/graduate-alumni-list/index.html"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
table = soup.find('tbody')
td = table.find_all('td')
data = []
for element in td:
    sub_data = []
    for sub_element in element:
        try:
            sub_data.append(sub_element.get_text())
        except:
            continue
    data.append(sub_data)
dataFrame = pd.DataFrame(data = data)
df = dataFrame[[1,3]]
df = df.dropna()
So df.iat[0,1] contains the program, defense year, advisor, dissertation title, and undergraduate institution. The HTML only uses "br" and "strong" to separate these values, and I am wondering if there is any way to split this text into separate columns such as "name", "program", "defense year", and so on, instead of one cell containing all the information.
thank you so much!
Inside the try: block, before the sub_data.append line, split each cell's content on its <br> tags. Note that get_text() drops the tags, so split the cell's HTML first and then strip the tags from each part:
sub_data_splitted = [BeautifulSoup(part, "html.parser").get_text(" ", strip=True) for part in str(element).split("<br/>")]
# After that you are able to use each field of the data i.e.
program = sub_data_splitted[0].split(":")[1]
defense_year = sub_data_splitted[1].split(":")[1]
advisor = sub_data_splitted[2].split(":")[1]
dissertation_title = sub_data_splitted[3].split(":")[1]
ug_institution = sub_data_splitted[4].split(":")[1]
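Putting it together, a minimal sketch (assuming each labelled part looks like "Label: value", and skipping cells that contain no labels) could collect the fields into a dataframe:

rows = []
for element in td:
    # split each <td> on its <br/> tags, then strip the remaining tags from each part
    parts = [BeautifulSoup(p, "html.parser").get_text(" ", strip=True)
             for p in str(element).split("<br/>")]
    # keep only "Label: value" pairs
    record = {p.split(":", 1)[0].strip(): p.split(":", 1)[1].strip()
              for p in parts if ":" in p}
    if record:
        rows.append(record)

df = pd.DataFrame(rows)
print(df.head())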
You can do it like this.
You can use .stripped_strings to get a list of data from each <tr> of the table.
Since you only need the values and not the labels (like Name, Defense Year, etc.), use a list comprehension to select only the required values.
Append each list to a dataframe.
Here is how it is done.
import requests
from bs4 import BeautifulSoup
import pandas as pd
URL = "https://education.scripps.edu/alumni/graduate-alumni-list/index.html"
page = requests.get(URL)
soup = BeautifulSoup(page.text, "lxml")
t = soup.find('table').find('tbody')
trs = t.find_all('tr')
data = []
for row in trs:
    l = [x for i, x in enumerate(row.stripped_strings) if i % 2 == 0]
    data.append(l)
df = pd.DataFrame(data=data)
0 ... 6
0 Abbott, PhD, Jason ... None
1 Adam, PhD, Gregory Charles ... None
2 Adhikari, PhD, Pramisha ... None
3 Al-Bassam, PhD, Jawdat M. H. ... None
4 Albertshofer, PhD, Klaus ... None
.. ... ... ...
682 Zhou, PhD, Jiacheng ... None
683 Zhou, PhD, Zhaohui (Sunny) ... None
684 Zhu, PhD, Ruyi ... None
685 Zhu, PhD, Yan ... None
686 Zuhl, PhD, Andrea M. ... None
[687 rows x 7 columns]
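If the fields appear in a consistent order on the page (an assumption to verify against a few scraped rows; the trailing column seems to be empty), readable column names can be assigned afterwards:

# Hypothetical column names - check the field order against the scraped data first.
df.columns = ['Name', 'Program', 'Defense Year', 'Advisor',
              'Dissertation Title', 'Undergraduate Institution', 'Extra']
print(df.head())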
Is this what you're trying to do?
import bs4
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
url = "https://education.scripps.edu/alumni/graduate-alumni-list/index.html"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
table = soup.find('tbody')
td = table.find_all('td')
data = {}
names = []
prev_name = None
for element in td:
    sub_data = {}
    for sub_element in element:
        try:
            data[sub_element['alt']] = {}
            prev_name = sub_element['alt']
        except:
            sub_element = str(sub_element).replace('</strong>', '').replace('<br/>', '</strong>')
            temp = BeautifulSoup(sub_element, 'html.parser')
            if len(temp.find_all('strong')) > 0:
                temp = [str(i.string) for i in temp.find_all('strong') if i.string is not None]
                temp = {i.split(':', 1)[0]: i.split(':', 1)[1] for i in temp if ':' in i}
                data[prev_name] = temp
df = pd.DataFrame(data = data)
df = df.T.reset_index()
df.rename(columns={'index' : 'Name'}, inplace=True)
I have scraped some day-by-day data (only numbers) and I want to show it in a proper table (dataframe). I don't know how to use pandas. I am using Python, and the end result should look like a table with defined column names. Thanks
And here is my python code:
import requests
from bs4 import BeautifulSoup
url = 'https://www.worldometers.info/coronavirus/country/Austria/'
page = requests.get(url)
soup = BeautifulSoup(page.text , 'html.parser')
# RECOVERED, DEATHS AND TOTAL CASES
Covid_Cases_Array = []
get_Covid_Cases = soup.find_all(class_ = 'maincounter-number')
for item in get_Covid_Cases:
    Covid_Cases_Array.append(item.text)
    print(item.text)
# ACTIVE AND CLOSED DATA
Covid_Active_Closed = []
get_Activ_Closed = soup.find_all(class_ = 'number-table-main')
for item in get_Activ_Closed:
    Covid_Active_Closed.append(item.text)
    print(item.text)
And the result of that code:
600,089
9,997
563,256
26,836
573,253
You can use this example to get the data from the page into a DataFrame:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://www.worldometers.info/coronavirus/country/Austria/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
cases, deaths, recovered = soup.select(".maincounter-number")
active_cases, closed_cases = soup.select(".number-table-main")
active_cases_mild, active_cases_serious, _, _ = soup.select(".number-table")
df = pd.DataFrame(
    {
        "Coronavirus Cases": [cases.get_text(strip=True)],
        "Deaths": [deaths.get_text(strip=True)],
        "Recovered": [recovered.get_text(strip=True)],
        "Currently infected": [active_cases.get_text(strip=True)],
        "Closed cases": [closed_cases.get_text(strip=True)],
        "Active cases (mild)": [active_cases_mild.get_text(strip=True)],
        "Active cases (serious)": [active_cases_serious.get_text(strip=True)],
    }
)
print(df)
Prints:
Coronavirus Cases Deaths Recovered Currently infected Closed cases Active cases (mild) Active cases (serious)
0 600,089 9,997 563,256 26,836 573,253 26,279 557
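Since the scraped figures are strings with thousands separators, a short follow-up sketch (assuming the df built above, and that every cell is a plain number) converts them to integers for further analysis:

# strip the thousands separators and cast every column to int
df_numeric = df.apply(lambda col: col.str.replace(",", "").astype(int))
print(df_numeric.dtypes)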
I am trying to get the second table element for a list of links and store the results as a pandas dataframe. To accomplish this task I defined a function getCitySalaryTable():
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd
job_title_urls=['https://www.salario.com.br/profissao/abacaxicultor-cbo-612510',
'https://www.salario.com.br/profissao/abade-cbo-263105']
def getCitySalaryTable(job_title_urls, city_salary_df):
    for url in job_title_urls:
        original_url = url
        url = requests.get(url)
        soup = BeautifulSoup(url.text, 'lxml')
        tables = soup.find_all('table', attrs={'class': 'listas'})

        # I suspect the problem is here #
        city_salary_table = tables[1]
        #################################

        # extracting column names
        heads = city_salary_table.find('thead').find('tr').find_all('th')
        colnames = [hdr.text for hdr in heads]

        # extracting rows
        data = {k: [] for k in colnames}
        rows = city_salary_table.find('tbody').find_all('tr')
        for rw in rows:
            for col in colnames:
                cell = rw.find('td', attrs={'data-label': '{}'.format(col)})
                data[col].append(cell.text)
        #print(data)

        # Constructing a pandas dataframe using the data just parsed
        """
        adding keys: cbo, job_title
        """
        cbo = original_url.split('/')[-1].split('-')[-1]
        job_title = original_url.split('/')[-1].split('-')[0]
        df = pd.DataFrame.from_dict(data)
        df.insert(0, 'cbo', '')
        df['cbo'] = cbo
        df.insert(1, 'job_title', '')
        df['job_title'] = job_title
        city_salary_df = pd.concat([city_salary_df, df], ignore_index=True)
        return city_salary_df
However when applied:
city_salary_df = pd.DataFrame()
city_salary_df = getCitySalaryTable(job_title_urls, city_salary_df)
However, it returns a dataframe just for the first link; I suspect that the index in the function (city_salary_table = tables[1]) is not correct for the other links.
# cbo job_title ... Salário/Hora Total
#0 612510 abacaxicultor ... 6,16 29
#1 612510 abacaxicultor ... 5,96 6
#2 612510 abacaxicultor ... 6,03 4
#3 612510 abacaxicultor ... 16,02 4
#4 612510 abacaxicultor ... 4,75 3
#5 612510 abacaxicultor ... 5,13 3
#[6 rows x 9 columns]
How could I properly tell the function to return me just the second table for all links?
Use nth-of-type if it is truly the 2nd table:
soup.select_one('table:nth-of-type(2)')
Though a class selector is faster than a type selector:
soup.select_one('.listas:nth-of-type(2)')
import requests
from bs4 import BeautifulSoup as bs
soup = bs(requests.get('https://www.salario.com.br/profissao/abacaxicultor-cbo-612510').text, 'lxml')
soup.select_one('.listas:nth-of-type(2)')
Your last link doesn't have that table so add a check on whether city_salary_table is None:
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd
job_title_urls=['https://www.salario.com.br/profissao/abacaxicultor-cbo-612510',
'https://www.salario.com.br/profissao/abade-cbo-263105',
'https://www.salario.com.br/profissao/abadessa-cbo-263105',
'https://www.salario.com.br/profissao/abanador-na-agricultura-cbo-622020']
def getCitySalaryTable(job_title_urls, city_salary_df):
    for url in job_title_urls:
        r = requests.get(url)
        print()
        soup = BeautifulSoup(r.text, 'lxml')

        # I suspect the problem is here #
        city_salary_table = soup.select_one('.listas:nth-of-type(2)')
        #################################

        if city_salary_table is not None:
            # extracting column names
            heads = city_salary_table.find('thead').find('tr').find_all('th')
            colnames = [hdr.text for hdr in heads]

            # extracting rows
            data = {k: [] for k in colnames}
            rows = city_salary_table.find('tbody').find_all('tr')
            for rw in rows:
                for col in colnames:
                    cell = rw.find('td', attrs={'data-label': '{}'.format(col)})
                    data[col].append(cell.text)
            #print(data)

            # Constructing a pandas dataframe using the data just parsed
            """
            adding keys: cbo, job_title
            """
            cbo = url.split('/')[-1].split('-')[-1]
            job_title = url.split('/')[-1].split('-')[0]
            df = pd.DataFrame.from_dict(data)
            df.insert(0, 'cbo', '')
            df['cbo'] = cbo
            df.insert(1, 'job_title', '')
            df['job_title'] = job_title
            city_salary_df = pd.concat([city_salary_df, df], ignore_index=True)

    return city_salary_df
city_salary_df = pd.DataFrame()
city_salary_df = getCitySalaryTable(job_title_urls, city_salary_df)
print(city_salary_df)
Google Colab:
I think Google Colab is using an old version of soupsieve, so the NotImplementedError for :nth-of-type is not being surfaced. Instead, you can use city_salary_table = soup.select_one('table + table').
I am scraping a table from https://csr.gov.in/companyprofile.php?year=FY+2015-16&CIN=L00000CH1990PLC010573 but I am not getting the exact result I am looking for. I want 11 columns from this link: "Company Name", "Class", "State", "Company Type", "RoC", "Sub Category", "Listing Status". Those are 7 columns, and after that you can see an expand button "CSR Details of FY 2017-18"; when you click on it you get 4 more columns: "Average Net Profit", "CSR Prescribed Expenditure", "CSR Spent", "Local Area Spent". I want all of these columns in a csv file. I wrote some code but it is not working properly. I am attaching an image of the result for reference. Here is my code; please help me get this data.
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import csv
driver = webdriver.Chrome()
url_file = "csrdata.txt"
with open(url_file, "r") as url:
url_pages = url.read()
# we need to split each urls into lists to make it iterable
pages = url_pages.split("\n") # Split by lines using \n
data = []
# now we run a for loop to visit the urls one by one
for single_page in pages:
driver.get(single_page)
r = requests.get(single_page)
soup = BeautifulSoup(r.content, 'html5lib')
driver.find_element_by_link_text("CSR Details of FY 2017-18").click()
table = driver.find_elements_by_xpath("//*[contains(#id,'colfy4')]")
about = table.__getitem__(0).text
x = about.split('\n')
print(x)
data.append(x)
df = pd.DataFrame(data)
print(df)
# write to csv
df.to_csv('csr.csv')
You don't need to use Selenium, since all the information is already in the HTML. You can also use pandas' built-in function pd.read_html() to transform the HTML table directly into a dataframe.
data = []
for single_page in pages:
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')
    table = soup.find_all('table')  # finds all tables
    table_top = pd.read_html(str(table))[0]  # the top table
    try:  # try to get the other table if it exists
        table_extra = pd.read_html(str(table))[7]
    except:
        table_extra = pd.DataFrame()
    result = pd.concat([table_top, table_extra])
    data.append(result)
pd.concat(data).to_csv('test.csv')
output:
0 1
0 Class Public
1 State Chandigarh
2 Company Type Other than Govt.
3 RoC RoC-Chandigarh
4 Sub Category Company limited by shares
5 Listing Status Listed
0 Average Net Profit 0
1 CSR Prescribed Expenditure 0
2 CSR Spent 0
3 Local Area Spent 0
I want to save data from a table, which has been scraped from a website, into a SQLite database. Here is what I have been able to do so far. Preferably I want to save a value into a variable and later load it into the database.
import requests
from lxml import etree
from bs4 import BeautifulSoup
response = requests.get("https://www.boerse.de/historische-kurse/Daimler-Aktie/DE0007100000")
# storing content of page
src = response.content
# create BeatifulSoup Object based on src
soup = BeautifulSoup(src, 'lxml')
tables = soup.find_all("tr")
"""for table in tables:
if "17.03.20" in table.text:
table = table.text
table = etree.HTML(table)
rows = iter(table)
for row in rows:
values = [col.text for col in row]
print(values)"""
for table in tables:
if "17.03.20" in table.text:
print(table)
import requests
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
response = requests.get("https://www.boerse.de/historische-kurse/Daimler-Aktie/DE0007100000")
# storing content of page
src = response.content
# create BeatifulSoup Object based on src
soup = BeautifulSoup(src, 'html.parser')
tables = soup.find_all("table")
for table in tables:
    if "17.03.20" in table.text:
        df = pd.read_html(str(table))[0]
        row = df[df['Datum'] == "17.03.20"]
        print(row)
Output:
print (row)
Datum Erster Schluss Hoch Tief ... Schluss Volumen Veränderung Veränderung
0 17.03.20 23,77 23,98 24,81 21,57 ... 2398 4.290.555 5,64% 5,64%
[1 rows x 7 columns]
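To get the extracted row into a SQLite database, as asked in the question, one option is pandas' to_sql. This is a minimal sketch; the database file and table name are arbitrary examples:

import sqlite3

# assumes `row` is the one-row DataFrame extracted above;
# the duplicated "Veränderung" column is dropped so the column names are unique
row = row.loc[:, ~row.columns.duplicated()]

conn = sqlite3.connect("kurse.db")     # example database file
row.to_sql("daimler_kurse", conn,      # example table name
           if_exists="append", index=False)
conn.commit()
conn.close()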
To check if a date is a Sunday:
You need to convert the string to a datetime object. Then you can either convert it into a string that states the day of the week, or check the numerical value (6 is Sunday).
import datetime
dateStr = '17.03.20'
date_object = datetime.datetime.strptime(dateStr, '%d.%m.%y')
print (date_object.strftime('%A'))
print (date_object.weekday()) # Sunday = 6, Saturday = 5
Output:
Tuesday
1
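The same check can also be done column-wise with pandas, for example to label each scraped row with its weekday or to drop weekend dates. This is a sketch assuming the df read with pd.read_html above; the Wochentag column name is just an example:

import pandas as pd

datum = pd.to_datetime(df["Datum"], format="%d.%m.%y")
df["Wochentag"] = datum.dt.day_name()   # e.g. "Tuesday"
df = df[datum.dt.weekday < 5]           # keep Monday to Friday only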