Crawling data from the web, then restructuring it into a Pandas DataFrame - python

I have a code like this:
import os
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta

# Template URL for the daily-history page; year, month, day get filled in below.
URL_TEMPLATES = 'https://freemeteo.vn/thoi-tiet/ngoc-ha/history/daily-history/?gid=1572463&station=11376&date={}-{:02d}-{:02d}&language=vietnamese&country=vietnam'
urls = URL_TEMPLATES.format(2015, 1, 1)
html_docs = requests.get(urls).text
# Fix: parse the variable that actually holds the HTML (`html_docs`, not the
# undefined `html_doc`), and name the parser explicitly to avoid bs4's
# GuessedAtParserWarning.
soups = BeautifulSoup(html_docs, 'html.parser')
# Fix: the soup object is named `soups`, not the undefined `soup`.
tables = soups.find(class_='table-list')
tables
Then the result looks like this:
<div class="table-list">
<ul><li>Yên Phụ</li>
<li>Hữu Tiệp</li>
Can anyone help me convert `tables` into a pandas DataFrame so it is easier to handle — for example, so that I can select the 'Yên Phụ' string? Thanks

You can parse the tables directly with pandas.
import pandas as pd

# Fix: the URL still contains unfilled {}-{:02d}-{:02d} placeholders, so
# requesting it as-is would fail — format in a concrete date first.
URL_TEMPLATE = 'https://freemeteo.vn/thoi-tiet/ngoc-ha/history/daily-history/?gid=1572463&station=11376&date={}-{:02d}-{:02d}&language=vietnamese&country=vietnam'
url = URL_TEMPLATE.format(2015, 1, 1)
# read_html returns a list of DataFrames, one per <table> on the page.
tables = pd.read_html(url)
That will give you a list of dataframes. Each table on the page will be one dataframe.
Then you can query a dataframe like:
# Select rows of the first parsed table whose 'Tên' column equals 'Hà Nội'.
first_table = tables[0]
first_table.query("Tên == 'Hà Nội'")
If you just wanted the cities in table-list div:
resp = requests.get(url)
# Name the parser explicitly to avoid bs4's GuessedAtParserWarning.
soup = BeautifulSoup(resp.text, 'html.parser')
table_list = soup.find('div', {'class': 'table-list'})

# Collect each city's display name and the href of its anchor tag.
names, links = [], []
for city in table_list.find_all('a'):
    names.append(city.text)
    links.append(city['href'])
To turn the above 2 lists into a dataframe:
# Pair each city name with its link and build a two-column DataFrame.
pairs = zip(names, links)
df = pd.DataFrame(pairs, columns=['City', 'Link'])

Related

Python: How to Webscrape All Rows from a Specific Table

For practice, I am trying to webscrape financial data from one table in this url: https://www.macrotrends.net/stocks/charts/TSLA/tesla/revenue
I'd like to save the data from the "Tesla Quarterly Revenue" table into a data frame and return two columns: Date, Revenue.
Currently the code as it runs now is grabbing data from the adjacent table, "Tesla Annual Revenue." Since the tables don't seem to have unique id's from which to separate them in this instance, how would I select elements only from the "Tesla Quarterly Revenue" table?
Any help or insight on how to remedy this would be deeply appreciated.
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.macrotrends.net/stocks/charts/TSLA/tesla/revenue"
html_data = requests.get(url).text
soup = BeautifulSoup(html_data, 'html5lib')

# Fix: soup.find("tbody") returns only the FIRST tbody on the page, which is
# the "Tesla Annual Revenue" table.  The quarterly table is the second one,
# so index into find_all("tbody") instead.
rows = []
for row in soup.find_all("tbody")[1].find_all("tr"):
    col = row.find_all("td")
    # col[0] is the period date, col[1] the revenue figure.
    rows.append({"Date": col[0].text, "Revenue": col[1].text})

# DataFrame.append was removed in pandas 2.0 — collect rows in a list and
# build the frame in a single call instead of appending row by row.
tesla_revenue = pd.DataFrame(rows, columns=["Date", "Revenue"])
tesla_revenue.head()
Below are the results when I run this code:
You can let pandas do all the work
import pandas as pd

url = "https://www.macrotrends.net/stocks/charts/TSLA/tesla/revenue"
# read_html returns every <table> on the page as its own DataFrame.
tables = pd.read_html(url)

# The quarterly-revenue table is the second one on the page.
# (Removed the original `for df in tables: pass` loop — it did nothing.)
df = tables[1]
df.columns = ['Date', 'Revenue']  # rename the columns if you want to
print(df)

TypeError: 'in <string>' requires string as left operand, not NoneType

I am trying to create a simple scraper to gather basketball stats. I was able to get the info I want; however, I can't figure out how to organize it all in a table.
I keep getting a "TypeError: 'in ' requires string as left operand, not NoneType."
Please see my code below:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

url = 'https://basketball.realgm.com/ncaa/boxscore/2021-01-29/North-Texas-at-Rice/367436'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# Extracting Columns — take the header TEXT, not the bs4 Tag objects.
# Feeding raw Tags into pandas is what triggered
# "TypeError: 'in <string>' requires string as left operand, not NoneType".
summary = soup.find('div', class_='boxscore-gamesummary')
columns = [th.text for th in summary.find_all('th', class_='nosort')]

# Extracting Stats — likewise, keep plain strings only.
stats = [td.text for td in summary.find_all('td')]

# Fix: final_df was referenced before ever being defined — start empty.
final_df = pd.DataFrame()

# Filling DataFrame: one row of stats under the extracted column headers.
temp_df = pd.DataFrame(stats).transpose()
temp_df.columns = columns
final_df = pd.concat([final_df, temp_df], ignore_index=True)
final_df
Looking forward to hearing from someone
Pandas already has a built-in method to get a dataframe from HTML which should make things way easier here.
Code
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Box score for North Texas at Rice, 2021-01-29.
url = 'https://basketball.realgm.com/ncaa/boxscore/2021-01-29/North-Texas-at-Rice/367436'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# Grab every <table> inside the game-summary div, then hand the raw HTML
# to pandas; the first parsed frame is the summary we want.
summary_div = soup.find('div', class_='boxscore-gamesummary')
tables = summary_div.find_all('table')
df = pd.read_html(str(tables))[0]
print(df)
Output
Unnamed: 0 1 2 Final
0 UNT (8-5) 36 43 79
1 RU (10-7) 37 37 74

Adding href to panda .read_html DF

I want to create a table with the information available on this website. I want the table to have 3 columns: 0 series/date, 1 title and 2 links. I already managed to get the first two columns but I don't know how to get the link for each entry.
import pandas as pd
import requests

# Fetch the resolutions page and let pandas parse every HTML table in it.
url = "http://legislaturautuado.com/pgs/resolutions.php?st=5&f=2016"
r = requests.get(url)
df_list = pd.read_html(r.text)

# The first table on the page is the one of interest.
df = df_list[0]
df.head()
Will it be possible to get what I want by only using pandas?
As far as I know, it's not possible with pandas only. It can be done with BeautifulSoup, though:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "http://legislaturautuado.com/pgs/resolutions.php?st=5&f=2016"
r = requests.get(url)
# Name the parser explicitly to avoid bs4's GuessedAtParserWarning.
html_table = BeautifulSoup(r.text, 'html.parser').find('table')
r.close()

# First row of the table is the header.
df = pd.read_html(str(html_table), header=0)[0]
# NOTE(review): assumes exactly one <a> per data row, in row order — verify
# against the page, otherwise the list length won't match the frame.
df['Link'] = [link.get('href') for link in html_table.find_all('a')]

Get xml from webservice?

I'm trying to get a data from this site
and then use some of it. Sorry for not copy-paste it but it's a long xml. So far I tried to get this data those ways:
from urllib.request import urlopen

url = "http://degra.wi.pb.edu.pl/rozklady/webservices.php?"
# Use a context manager so the HTTP response is always closed — the
# original never closed it.
with urlopen(url) as s:
    content = s.read()
as print(content) looks good, now I would like to get a data from it
<tabela_rozklad data-aktualizacji="1480583567">
<DZIEN>2</DZIEN>
<GODZ>3</GODZ>
<ILOSC>2</ILOSC>
<TYG>0</TYG>
<ID_NAUCZ>66</ID_NAUCZ>
<ID_SALA>79</ID_SALA>
<ID_PRZ>104</ID_PRZ>
<RODZ>W</RODZ>
<GRUPA>1</GRUPA>
<ID_ST>13</ID_ST>
<SEM>1</SEM>
<ID_SPEC>0</ID_SPEC>
</tabela_rozklad>
How can I handle this data to easy use it?
You can use Beautiful soup and capture the tags you want. The code below should get you started!
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "http://degra.wi.pb.edu.pl/rozklady/webservices.php?"
# secure url content
response = requests.get(url).content
# Name the parser explicitly; html.parser lowercases tag names, which the
# lowercase lookup list below relies on.
soup = BeautifulSoup(response, 'html.parser')

# find each tabela_rozklad
tables = soup.find_all('tabela_rozklad')

# each tabela_rozklad has 12 nested corresponding tags
tags = ['dzien', 'godz', 'ilosc', 'tyg', 'id_naucz', 'id_sala',
        'id_prz', 'rodz', 'grupa', 'id_st', 'sem', 'id_spec']

# Fix: DataFrame.append was removed in pandas 2.0 (and `all` shadowed the
# builtin) — gather one row per table, then build the frame in one call.
rows = [[table.find(tag).text for tag in tags] for table in tables]
df = pd.DataFrame(rows, columns=tags)

# display first 5 rows of table
df.head()
# and the shape of the data
df.shape  # 665 rows, 12 columns
# and now you can get to the information using traditional pandas functionality
# for instance, count observations by rodz
df.groupby('rodz').count()
# or subset only observations where rodz = J
J = df[df.rodz == 'J']

Convert a cell into different columns & rows of the DataFrame

I am trying to scrape a table out of a site. I have tried to convert data_row to a Pandas DataFrame; however, all the data are lumped in one cell of the DataFrame. Would you guys please help me convert the data_row into a Pandas DataFrame with "Business Mileage," "Charitable Mileage," "Medical Mileage," and "Moving Mileage" as rows and "2016," "2015," "2014," "2013," "2012," "2011," and "2010" as columns?
from bs4 import BeautifulSoup
# Fix: urllib2 and the print statement are Python 2 only — the rest of the
# file is Python 3, so use urllib.request and print() here as well.
from urllib.request import urlopen
import pandas as pd

# Close the response deterministically via a context manager.
with urlopen('http://www.smbiz.com/sbrl003.html#cmv') as r:
    soup = BeautifulSoup(r.read(), 'html.parser')
print(soup.prettify())
# The first <pre> block holds the preformatted mileage-rate table text.
data_row = soup.findAll('pre')[0:1]

Categories

Resources