How do I append the output from BeautifulSoup to a pandas dataframe - python

I am relatively new to Python. I am planning to:
a) obtain a list of URLs from the following URL (https://aviation-safety.net/database/) with data from the year 1919 onwards (https://aviation-safety.net/database/dblist.php?Year=1919).
b) obtain the data (date, type, registration, operator, fat., location, cat) from 1919 to the current year.
However, I ran into some problems and am still stuck on a).
Any form of help is appreciated, thank you so much!
#import packages
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
#start of code
mainurl = "https://aviation-safety.net/database/"
def getAndParseURL(mainurl):
    result = requests.get(mainurl)
    soup = BeautifulSoup(result.content, 'html.parser')
    datatable = soup.find('a', href = True)

#try clause to go through the content and grab the URLs
try:
    for row in datatable:
        cols = row.find_all("|")
        if len(cols) > 1:
            links.append(x, cols = cols)
except: pass

#place links into numpy array
links_array = np.asarray(links)
len(links_array)

#check if links are in dataframe
df = pd.DataFrame(links_array)
df.columns = ['url']
df.head(10)
I can't seem to get the URLs.
It would be great if I could get the following:
S/N URL
1 https://aviation-safety.net/database/dblist.php?Year=1919
2 https://aviation-safety.net/database/dblist.php?Year=1920
3 https://aviation-safety.net/database/dblist.php?Year=1921

You're not extracting the href attributes from the tags you are pulling. What you want to do is find all <a> tags with links (which you did, but you need to use find_all, as find will just return the first one it finds). Then iterate through those tags. I chose to have it check each href for the substring 'Year' and, if it's there, put that link into the list.
#import packages
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
#start of code
mainurl = "https://aviation-safety.net/database/"
def getAndParseURL(mainurl):
    result = requests.get(mainurl)
    soup = BeautifulSoup(result.content, 'html.parser')
    datatable = soup.find_all('a', href = True)
    return datatable

datatable = getAndParseURL(mainurl)

#go through the content and grab the URLs
links = []
for link in datatable:
    if 'Year' in link['href']:
        url = link['href']
        links.append(mainurl + url)

#check if links are in dataframe
df = pd.DataFrame(links, columns=['url'])
df.head(10)
Output:
df.head(10)
Out[24]:
url
0 https://aviation-safety.net/database/dblist.ph...
1 https://aviation-safety.net/database/dblist.ph...
2 https://aviation-safety.net/database/dblist.ph...
3 https://aviation-safety.net/database/dblist.ph...
4 https://aviation-safety.net/database/dblist.ph...
5 https://aviation-safety.net/database/dblist.ph...
6 https://aviation-safety.net/database/dblist.ph...
7 https://aviation-safety.net/database/dblist.ph...
8 https://aviation-safety.net/database/dblist.ph...
9 https://aviation-safety.net/database/dblist.ph...
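For part b), here is a minimal sketch of how the collected year URLs could be turned into one big table. It assumes each year page holds its accident records in an HTML table that pd.read_html can parse and that the site accepts a browser-like User-Agent header; neither assumption is verified here, and any pagination on busy years is ignored.
#sketch for part b) -- assumptions noted above
import pandas as pd
import requests

headers = {'User-Agent': 'Mozilla/5.0'}   #some sites reject the default requests user agent

frames = []
for year_url in df['url']:
    resp = requests.get(year_url, headers=headers)
    try:
        #read_html returns a list of DataFrames, one per <table> on the page;
        #we assume the first one is the accident listing
        frames.append(pd.read_html(resp.text)[0])
    except ValueError:
        #read_html raises ValueError when it finds no table on the page
        print('no table found for', year_url)

all_years = pd.concat(frames, ignore_index=True)
print(all_years.head())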

Related

How to collect "td" text from list of lists and add them into the dictionary python beautifulSoup

Here I am trying to get the value of every column in the table shown in the picture (for three different pages) and store them in a pandas dataframe. I have collected the data and now I have a list of lists, but when I try to add them to a dictionary I get an empty dictionary. Can anyone tell me what I'm doing wrong or suggest an alternative way to create 3 dataframes, one for each table?
Here is my code:
import numpy as np
import pandas as pd
from datetime import datetime
import pytz
import requests
import json
from bs4 import BeautifulSoup
url_list = ['https://www.coingecko.com/en/coins/ethereum/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel',
'https://www.coingecko.com/en/coins/cardano/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel',
'https://www.coingecko.com/en/coins/chainlink/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel']
results = []
for url in url_list:
    response = requests.get(url)
    src = response.content
    soup = BeautifulSoup(response.text , 'html.parser')
    results.append(soup.find_all( "td",class_= "text-center"))

collected_data = dict()
for result in results:
    for r in result:
        datas = r.find_all("td", title=True)
        for data in datas:
            collected_data.setdefault(data.text)

collected_data
What happens?
In your first for loop you are only appending the result set of soup.find_all("td", class_="text-center") to results. Each r is therefore already a <td> tag, so you won't find anything with datas = r.find_all("td", title=True).
Note also that the column headers are not placed in <td> but in <th>.
How to fix?
You could select more specifically and iterate over all <tr> in <tbody>:
for row in soup.select('tbody tr'):
While iterating, select the <th> and <td> cells and zip() them with the list of column headers into a dict():
    data.append(
        dict(zip([x.text for x in soup.select('thead th')], [x.text.strip() for x in row.select('th,td')]))
    )
Example
import pandas as pd
import requests
from bs4 import BeautifulSoup
url_list = ['https://www.coingecko.com/en/coins/ethereum/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel',
'https://www.coingecko.com/en/coins/cardano/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel',
'https://www.coingecko.com/en/coins/chainlink/historical_data/usd?start_date=2021-08-06&end_date=2021-09-05#panel']
data = []
for url in url_list:
    response = requests.get(url)
    src = response.content
    soup = BeautifulSoup(response.text , 'html.parser')
    for row in soup.select('tbody tr'):
        data.append(
            dict(zip([x.text for x in soup.select('thead th')], [x.text.strip() for x in row.select('th,td')]))
        )

pd.DataFrame(data)
Output
Date        Market Cap        Volume           Open       Close
2021-09-05  $456,929,768,632  $24,002,848,309  $3,894.94  N/A
2021-09-04  $462,019,852,288  $30,463,347,266  $3,936.16  $3,894.94
2021-09-03  $444,936,758,975  $28,115,776,510  $3,793.30  $3,936.16
EDIT
To get a data frame per URL you can change the code to the following. It appends the frames to a list, so that you can iterate over them later.
Note: This is based on your comment, and if it fits, okay. I would suggest also storing the coin provider as a column, so you can filter, group by, etc. across all providers (a small sketch of that follows the output below), but that should be asked in a new question if it matters.
dfList = []
for url in url_list:
    response = requests.get(url)
    src = response.content
    soup = BeautifulSoup(response.text , 'html.parser')
    data = []
    coin = url.split("/")[5].upper()

    for row in soup.select('tbody tr'):
        data.append(
            dict(zip([f'{x.text}_{coin}' for x in soup.select('thead th')], [x.text.strip() for x in row.select('th,td')]))
        )
    # if you like to save directly as csv, change the next line to -> pd.DataFrame(data).to_csv(f'{coin}.csv')
    dfList.append(pd.DataFrame(data))
Output
Select a data frame by list index, for example dfList[0]:
Date_ETHEREUM  Market Cap_ETHEREUM  Volume_ETHEREUM  Open_ETHEREUM  Close_ETHEREUM
2021-09-05     $456,929,768,632     $24,002,848,309  $3,894.94      N/A
2021-09-04     $462,019,852,288     $30,463,347,266  $3,936.16      $3,894.94
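As mentioned above, here is a small sketch (reusing url_list and the same selectors from the example above) of storing the coin as its own column and concatenating everything into one frame, so you can filter or group by coin:
import pandas as pd
import requests
from bs4 import BeautifulSoup

frames = []
for url in url_list:
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    coin = url.split("/")[5].upper()
    headers = [x.text for x in soup.select('thead th')]
    rows = [dict(zip(headers, [x.text.strip() for x in row.select('th,td')]))
            for row in soup.select('tbody tr')]
    frame = pd.DataFrame(rows)
    frame['Coin'] = coin                     #e.g. 'ETHEREUM'
    frames.append(frame)

combined = pd.concat(frames, ignore_index=True)
print(combined.groupby('Coin').size())       #row count per coin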

TypeError: 'in <string>' requires string as left operand, not NoneType

I am trying to create a simple scraper to gather basketball stats. I was able to get the info I want, however, I can't figure out how to organize it all in a table.
I keep getting a "TypeError: 'in <string>' requires string as left operand, not NoneType."
Please see my code below:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
url = 'https://basketball.realgm.com/ncaa/boxscore/2021-01-29/North-Texas-at-Rice/367436'
page = requests.get(url)
soup = BeautifulSoup(page.content , 'html.parser')
#Extracting Columns
tables = soup.find('div', class_= 'boxscore-gamesummary')
columns = tables.find_all('th', class_='nosort')
#Extracting Stats
tables = soup.find('div', class_= 'boxscore-gamesummary')
stats = tables.find_all('td')
#Filling DataFrame
temp_df = pd.DataFrame(stats).transpose()
temp_df.columns = columns
final_df = pd.concat([final_df,temp_df], ignore_index=True)
final_df
Looking forward to hearing from someone
Pandas already has a built-in method to get a dataframe from HTML which should make things way easier here.
Code
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://basketball.realgm.com/ncaa/boxscore/2021-01-29/North-Texas-at-Rice/367436'
page = requests.get(url)
soup = BeautifulSoup(page.content , 'html.parser')
tables = soup.find('div', class_= 'boxscore-gamesummary').find_all('table')
df = pd.read_html(str(tables))[0]
print(df)
Output
Unnamed: 0 1 2 Final
0 UNT (8-5) 36 43 79
1 RU (10-7) 37 37 74
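If you want to build the final_df from the question across several games, one sketch (the commented-out second URL is just a hypothetical placeholder) is to collect one frame per box score and concatenate at the end:
import pandas as pd
import requests
from bs4 import BeautifulSoup

urls = [
    'https://basketball.realgm.com/ncaa/boxscore/2021-01-29/North-Texas-at-Rice/367436',
    #'https://basketball.realgm.com/ncaa/boxscore/...',   #add further games here
]

frames = []
for url in urls:
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    tables = soup.find('div', class_='boxscore-gamesummary').find_all('table')
    frames.append(pd.read_html(str(tables))[0])   #one summary frame per game

final_df = pd.concat(frames, ignore_index=True)
print(final_df)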

Adding href to pandas .read_html DF

I want to create a table with the information available on this website. I want the table to have 3 columns: 0 series/date, 1 title and 2 links. I already managed to get the first two columns but I don't know how to get the link for each entry.
import pandas as pd
import requests
url = "http://legislaturautuado.com/pgs/resolutions.php?st=5&f=2016"
r = requests.get(url)
df_list = pd.read_html(r.text)
df = df_list[0]
df.head()
Will it be possible to get what I want by only using pandas?
As far as I know, it's not possible with pandas only. It can be done with BeautifulSoup, though:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "http://legislaturautuado.com/pgs/resolutions.php?st=5&f=2016"
r = requests.get(url)
html_table = BeautifulSoup(r.text).find('table')
r.close()
df = pd.read_html(str(html_table), header=0)[0]
df['Link'] = [link.get('href') for link in html_table.find_all('a')]
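Note that the list comprehension above assumes every data row contains exactly one <a> tag. If some rows might lack a link, a slightly more defensive sketch (reusing html_table and df from the snippet above, and assuming the header sits in the first <tr> so the row counts line up) is to walk the rows directly:
links = []
for tr in html_table.find_all('tr')[1:]:      #skip the header row
    a = tr.find('a', href=True)
    links.append(a['href'] if a else None)    #None keeps the column aligned with its row
df['Link'] = links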

Html table scraping using beautifulsoup

I am trying to scrape a table from an SEC 10-K filing. I think it is going all right except the part where pandas converts it to a dataframe; as I am new to data frames, I think I am making a mistake in the indexing. Please help me with this, as I am getting the following error: "IndexError: index 2 is out of bounds for axis 0 with size 2"
I am using this program:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'https://www.sec.gov/Archives/edgar/data/1022344/000155837017000934/spg-20161231x10k.htm#Item8FinancialStatementsandSupplementary'
r = requests.get(url)
html_doc = r.text
soup = BeautifulSoup(html_doc, 'lxml')
table = soup.find_all('table')[0]
new_table = pd.DataFrame(columns=range(0,2), index = [0])
row_marker = 0
for row in table.find_all('tr'):
    column_marker = 0
    columns = row.find_all('td')
    for column in columns:
        new_table.iat[row_marker,column_marker] = column.get_text()
        column_marker += 1
new_table
If the dataframe issue isn't resolvable, then please suggest any other substitute, like writing the data to csv/excel. Any suggestion for extracting multiple tables at once would also be really helpful.
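One possible direction, sketched under the assumption that the filing's tables are plain HTML that pandas can parse (complex layouts may still need cleanup): let pd.read_html pull every <table> on the page, which also covers extracting multiple tables at once and writing them out to CSV.
import pandas as pd
import requests

url = 'https://www.sec.gov/Archives/edgar/data/1022344/000155837017000934/spg-20161231x10k.htm'
#the SEC may reject requests without a User-Agent header; add one if the request fails
html_doc = requests.get(url).text

tables = pd.read_html(html_doc)            #list of DataFrames, one per <table>
print(len(tables), 'tables found')

for i, table in enumerate(tables[:5]):     #write the first few tables to csv
    table.to_csv(f'table_{i}.csv', index=False)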

Appending links to new rows in pandas df after using beautifulsoup

I'm attempting to extract some links from a chunk of beautiful soup html and append them to rows of a new pandas dataframe.
So far, I have this code:
url = "http://www.reed.co.uk/jobs
datecreatedoffset=Today&isnewjobssearch=True&pagesize=100"
r = ur.urlopen(url).read()
soup = BShtml(r, "html.parser")
adcount = soup.find_all("div", class_="pages")
print(adcount)
From my output I then want to take every link, identified by href="" and store each one in a new row of a pandas dataframe.
Using the above snippet I would end up with 6 rows in my new dataset.
Any help would be appreciated!
Your link gives a 404, but the logic should be the same as below. You just need to extract the anchor tags with the page class and join them to the base URL:
import pandas as pd
from urlparse import urljoin   # on Python 3: from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
base = "http://www.reed.co.uk/jobs"
url = "http://www.reed.co.uk/jobs?keywords=&location=&jobtitleonly=false"
r = requests.get(url).content
soup = BeautifulSoup(r, "html.parser")
df = pd.DataFrame(columns=["links"], data=[urljoin(base, a["href"]) for a in soup.select("div.pages a.page")])
print(df)
Which gives you:
links
0 http://www.reed.co.uk/jobs?cached=True&pageno=2
1 http://www.reed.co.uk/jobs?cached=True&pageno=3
2 http://www.reed.co.uk/jobs?cached=True&pageno=4
3 http://www.reed.co.uk/jobs?cached=True&pageno=5
4 http://www.reed.co.uk/jobs?cached=True&pageno=...
5 http://www.reed.co.uk/jobs?cached=True&pageno=2
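Note that the sample output lists pageno=2 twice (pager blocks often appear at both the top and bottom of such pages, though that is only a guess here). If the duplicates matter, they can be dropped afterwards:
df = df.drop_duplicates(subset="links").reset_index(drop=True)
print(df)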
