I have the following code
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.ipma.pt/pt/otempo/obs.superficie/table-top-stations-all.jsp'
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'lxml')

# Get the content for tab_Co id
temp_table = soup.find('table', id='tab_Co')
if temp_table is None:
    raise ValueError("table with id 'tab_Co' not found in the page")

# Create headers from the <th> cells of the table
headers = [th.text for th in temp_table.find_all('th')]

# Collect every data row (the first <tr> is skipped as the header row).
# The original loop iterated `table1`, a name that was never defined; the
# table fetched above is `temp_table`.
rows = []
for tr in temp_table.find_all('tr')[1:]:
    cells = [td.text for td in tr.find_all('td')]
    if cells:  # rows without <td> cells carry no data
        rows.append(cells)

# Build the DataFrame in one step instead of growing it row by row with .loc
mydata = pd.DataFrame(rows, columns=headers)
What am I doing wrong? The final purpose is to have a dataframe where I can extract the top 4 values for each column
'Temperatura Max (ºC)',
'Temperatura Min (ºC)',
'Prec. acumulada (mm)',
'Rajada máxima (km/h)',
'Humidade Max (%)',
'Humidade Min (%)',
'Pressão atm. (hPa)']
and then use those to generate a daily image.
Any ideas? Thank you in advance!
Disclaimer: This is for a not-for-profit project, and no commercial use will be made of the solution.
So this worked, based on this solution by Falsovsky on GitHub
# Import libraries
import requests
import pandas as pd
import regex
# The code below needs the standard-library `re` and `json` modules; they are
# imported here because the snippet's import list does not bind either name.
import json
import re

# Define target URL
url = 'https://www.ipma.pt/pt/otempo/obs.superficie/table-top-stations-all.jsp'
# Get URL information
page = requests.get(url, timeout=30)
page.raise_for_status()
# After inspecting the page apply a regex search: the data is assigned to a
# JavaScript variable named `observations` inside the page source.
search = re.search(r'var observations = (.*?);', page.text, re.DOTALL)
if search is None:
    raise ValueError("'var observations = ...;' not found in the page source")
# Create dict by loading the json information
json_data = json.loads(search.group(1))
# Create Dataframe from json result: one transposed frame per top-level key,
# stacked with that key as the outer index level.
df1 = pd.concat({k: pd.DataFrame(v).T for k, v in json_data.items()}, axis=0)
From the source view-source:https://www.ipma.pt/pt/otempo/obs.superficie/table-top-stations-all.jsp, it is clear that the data is in the th attributes so try scraping with row_data = j.find_all('th')
Related
I am trying to scrape data from stathead.com, basketball-reference.com's new subscription service. When using my normal approach that I would've used on BR, it won't scrape the first 10 rows, or 21-100 rows, only 11-20. Any thoughts? For example, stats only returns a subset of the full data.
url = "https://stathead.com/basketball/lineup_finder.cgi?request=1&match=single&order_by_asc=0&order_by=diff_pts&lineup_type=2-man&output=per_poss&is_playoffs=N&year_id=2015&ccomp%5B1%5D=gt&cval%5B1%5D=100&cstat%5B1%5D=mp&game_month=0&game_num_min=0&game_num_max=99"
html = urlopen(url)
# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(html, "html.parser")
rows = soup.find_all('tr')[1:]
# Column names come from the second <tr>; its first <th> is dropped.
headers = [th.getText() for th in soup.find_all('tr', limit=2)[1].find_all('th')][1:]
# `rows` still contains that header row (and any other row made only of <th>
# cells); such rows have no <td> and would become empty records, so skip them.
player_stats = []
for row in rows:
    cells = [td.getText() for td in row.find_all('td')]
    if cells:
        player_stats.append(cells)
stats = pd.DataFrame(player_stats, columns=headers)
You can try the code below and filter out the required data afterwards.
import pandas as pd
# Page to read the tables from.
url = 'https://stathead.com/basketball/lineup_finder.cgi?request=1&match=single&order_by_asc=0&order_by=diff_pts&lineup_type=2-man&output=per_poss&is_playoffs=N&year_id=2015&ccomp%5B1%5D=gt&cval%5B1%5D=100&cstat%5B1%5D=mp&game_month=0&game_num_min=0&game_num_max=99'
# pd.read_html returns a list of DataFrames (one per table it can parse),
# so despite its name `df` is a list, not a single DataFrame.
df = pd.read_html(url)
print(df)
I have 10 links of companies.
https://www.zaubacorp.com/company/ASHRAFI-MEDIA-NETWORK-PRIVATE-LIMITED/U22120GJ2019PTC111757,
https://www.zaubacorp.com/company/METTLE-PUBLICATIONS-PRIVATE-LIMITED/U22120MH2019PTC329729,
https://www.zaubacorp.com/company/PRINTSCAPE-INDIA-PRIVATE-LIMITED/U22120MH2020PTC335354,
https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665,
https://www.zaubacorp.com/company/BHOOKA-NANGA-FILMS-PRIVATE-LIMITED/U22130DL2019PTC353194,
https://www.zaubacorp.com/company/WHITE-CAMERA-SCHOOL-OF-PHOTOGRAPHY-PRIVATE-LIMITED/U22130JH2019PTC013311,
https://www.zaubacorp.com/company/RLE-PRODUCTIONS-PRIVATE-LIMITED/U22130KL2019PTC059208,
https://www.zaubacorp.com/company/CATALIZADOR-MEDIA-PRIVATE-LIMITED/U22130KL2019PTC059793,
https://www.zaubacorp.com/company/TRIPPLED-MEDIAWORKS-OPC-PRIVATE-LIMITED/U22130MH2019OPC333171,
https://www.zaubacorp.com/company/KRYSTAL-CINEMAZ-PRIVATE-LIMITED/U22130MH2019PTC330391
Now I am trying to scrape tables from these links and save the data to CSV columns in a well-organised format. I want to scrape the tables "Company Details", "Share Capital & Number of Employees", "Listing and Annual Compliance Details", "Contact Details" and "Director Details". If a table has no data, or a column is missing, I want that column left blank in the output CSV file. I have written some code but can't get the expected output; I am doing something wrong here. Please help.
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import csv
import lxml
# Newer pandas versions deprecate passing a literal HTML string to
# read_html; wrapping it in StringIO works on old and new versions alike.
from io import StringIO

url_file = "Zaubalinks.txt"
with open(url_file, "r") as url:
    # One URL per line. Blank lines (for example a trailing newline at the end
    # of the file) are skipped so we never issue a request for an empty string.
    pages = [line.strip() for line in url if line.strip()]

# now we run a for loop to visit the urls one by one
data = []
for single_page in pages:
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')
    table = soup.find_all('table')  # finds all tables
    # Parse the HTML once and index into the result, instead of re-parsing
    # the same markup for every table we want.
    tables = pd.read_html(StringIO(str(table)))
    table_top = tables[0]  # the top table
    if len(tables) > 8:
        table_capital, table_listing, table_contact, table_director = tables[5:9]
    else:
        # Same fallback as before (all four empty when any is missing), but
        # without a bare `except:` that would also hide unrelated errors.
        table_capital = pd.DataFrame()
        table_listing = pd.DataFrame()
        table_contact = pd.DataFrame()
        table_director = pd.DataFrame()
    result = pd.concat([table_top, table_capital, table_listing, table_contact, table_director])
    data.append(result)
print(data)
pd.concat(data).to_csv('ZaubaAll.csv')
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Company path fragments ("<NAME>/<ID>"); each one is substituted into the URL
# template passed to main(). NOTE: this is a set literal, so iteration order
# is not guaranteed to follow the order written here.
companies = {
    'ASHRAFI-MEDIA-NETWORK-PRIVATE-LIMITED/U22120GJ2019PTC111757',
    'METTLE-PUBLICATIONS-PRIVATE-LIMITED/U22120MH2019PTC329729',
    'PRINTSCAPE-INDIA-PRIVATE-LIMITED/U22120MH2020PTC335354',
    'CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665',
    'BHOOKA-NANGA-FILMS-PRIVATE-LIMITED/U22130DL2019PTC353194',
    'WHITE-CAMERA-SCHOOL-OF-PHOTOGRAPHY-PRIVATE-LIMITED/U22130JH2019PTC013311',
    'RLE-PRODUCTIONS-PRIVATE-LIMITED/U22130KL2019PTC059208',
    'CATALIZADOR-MEDIA-PRIVATE-LIMITED/U22130KL2019PTC059793',
    'TRIPPLED-MEDIAWORKS-OPC-PRIVATE-LIMITED/U22130MH2019OPC333171',
    'KRYSTAL-CINEMAZ-PRIVATE-LIMITED/U22130MH2019PTC330391'
}
def main(url):
    """Scrape selected tables for every entry in ``companies`` into data.csv.

    Args:
        url: URL template with one ``{}`` placeholder that receives each
            company path fragment from the module-level ``companies``.

    Raises:
        requests.HTTPError: if any company page answers with an error status.

    Side effects:
        Writes the concatenated result to ``data.csv`` in the working
        directory.
    """
    # Local import: read_html is given a file-like object because newer
    # pandas versions deprecate passing literal HTML content directly.
    from io import BytesIO

    goal = []
    with requests.Session() as req:
        # `companies` is a set, whose iteration order can change between
        # runs; sorting makes the row order of the CSV reproducible.
        for company in sorted(companies):
            r = req.get(url.format(company))
            r.raise_for_status()  # do not try to parse an error page
            df = pd.read_html(BytesIO(r.content))
            # Tables 0, 3 and 4 are transposed and placed side by side.
            target = pd.concat([df[x].T for x in [0, 3, 4]], axis=1)
            goal.append(target)
    new = pd.concat(goal)
    new.to_csv("data.csv")
main("https://www.zaubacorp.com/company/{}")
Fortunately, it seems you can get there with simpler methods. Taking one random link as an example, it should be something like:
# Example company page.
url = 'https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665'
import pandas as pd
# read_html returns a list of DataFrames, one per table parsed from the page;
# index into `tables` to pick the ones you need.
tables = pd.read_html(url)
From here, your tables are in tables[0], tables[3], tables[4], tables[15], etc. Just use a for loop to rotate through all the urls.
I'm making some progress with web scraping; however, I still need some help performing some operations:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
# soup = BeautifulSoup(requests.get(converturl).content, 'html.parser')
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
out = []
for tr in soup.select('.col-md-4 tbody tr'):
I know there are 3 tables under the class col-md-4. I want to generate a CSV whose output has three values: first name, last name, and, as the last value, the header name of the table.
first name, last name, header table
Any help would be appreciated.
This is what I have done on my own:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
# Name the CSV after the last path segment of the URL.
filename = url.rsplit('/', 1)[1] + '.csv'
tables = soup.select('.col-md-4 table')
# One record per table: all of the table's text pieces, split on the '|'
# separator inserted between them.
rows = [
    table.get_text(strip=True, separator='|').split('|')
    for table in tables
]
df = pd.DataFrame(rows)
print(df)
df.to_csv(filename)
Thanks,
This might work:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
tables = soup.select('.col-md-4 table')
rows = []
for table in tables:
    # The first non-empty string of a table is used as its title; every
    # following string is split on ', ' and tagged with that title.
    texts = list(table.stripped_strings)
    header = texts[0]
    for entry in texts[1:]:
        rows.append(entry.split(', ') + [header])
result = pd.DataFrame.from_records(rows, columns=['surname', 'name', 'table'])
You need to first iterate through each table you want to scrape, then for each table, get its header and rows of data. For each row of data, you want to parse out the First Name and Last Name (along with the header of the table).
Here's a verbose working example:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
out = []
# Iterate through each of the tables under .col-md-4
for table in soup.select(".col-md-4 table"):
    # Grab the header from the table; skip tables that have none instead of
    # failing with an IndexError.
    header_cell = table.select_one("thead th")
    if header_cell is None:
        continue
    header = header_cell.text.strip()
    t = []  # This list will contain the rows of data for this table
    # Iterate through rows in this table
    for tr in table.select("tbody tr"):
        row = tr.text.strip()
        if not row:
            continue  # nothing to parse in an empty row
        # Split at the first comma (last_name, first_name). partition() never
        # raises: a row without a comma yields an empty first name.
        last_name, _, first_name = row.partition(",")
        # Create the row of data
        t.append([first_name.strip(), last_name.strip(), header])
    # Convert list of rows to a DataFrame
    df = pd.DataFrame(t, columns=["first_name", "last_name", "table_name"])
    # Append to list of DataFrames
    out.append(df)
# Write to CSVs...
if out:  # guard: the page may contain no matching table
    out[0].to_csv("first_table.csv", index=False)  # etc...
Whenever you're web scraping, I highly recommend using strip() on all of the text you parse to make sure you don't have superfluous spaces in your data.
I hope this helps!
I wanted to try to scrape some specific columns (Company details column) in the CNBC Nasdaq 100 website specifically the Adobe stocks, below is the snippet of my code
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
def get_company_info(url):
    """Fetch a company profile page and extract its details.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(key, df)`` tuple: ``key`` is a dict with the extracted fields
        and ``df`` is a one-row DataFrame built from it. A field whose
        element is not present in the downloaded HTML is ``None``.

    Side effects:
        Prints the DataFrame.
    """
    page_response = requests.get(url, timeout=240)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    key = {}

    # Ticker symbol from the quote header, when the header is present.
    header = page_content.find('div', {"class": "quote-section-header large-header"})
    symbol = header.find("span", {"class": "symbol"}) if header is not None else None
    key['name'] = symbol.text if symbol is not None else None

    # Long description block.
    desc = page_content.find('div', attrs={'id': 'descLong'})
    key["stock_desc"] = desc.text if desc is not None else None

    # The original called .find() on the list returned by find_all(), which
    # raises AttributeError; look the shares table up directly instead.
    shares = page_content.find("table", attrs={"id": "shares"})
    values = []
    if shares is not None:
        values = [td.text for td in shares.find_all("td", {"class": "bold aRit"})]

    # NOTE(review): assumes the "bold aRit" cells appear in this order —
    # confirm against the actual page markup.
    fields = (
        "stock_outstanding-shares",
        "stock_ownership",
        "stock_market_cap",
        "stock_lastSplit",
    )
    for position, field in enumerate(fields):
        key[field] = values[position] if position < len(values) else None

    # One record per page (the original appended the same dict repeatedly).
    df = pd.DataFrame([key])
    print(df)
    return key, df
get_company_info("https://www.cnbc.com/quotes/?symbol=ADBE&tab=profile")
So, I'm keen to get the result in a dataframe so that I can export it to a CSV file, but my code keeps showing an empty dataframe. Below is the error shown:
The result I wanted is something like this
The information you are looking for is not available in the url you requested. This is because the information is fetched by the page using a JavaScript. Which in turn requests a different URL which provides the data.
Example code
from bs4 import BeautifulSoup
import requests
page = requests.get("https://apps.cnbc.com/view.asp?symbol=ADBE.O&uid=stocks/summary")
soup = BeautifulSoup(page.content, 'html.parser')

# Company name and long description, looked up by element id.
Name = soup.find("h5", {"id": "companyName"}).text
stock_desc = soup.find("div", {"id": "descLong"}).text

# The figures are the cells of the "shares" table whose class attribute is
# exactly "bold aRit"; the first four are read by position.
table = soup.find("table", {"id": "shares"})
details = table.find_all("td", {"class": "bold aRit"})
stock_outstanding_shares = details[0].text
stock_ownership = details[1].text
stock_market_cap = details[2].text
stock_lastSplit = details[3].text
You can create dataframe and export to csv.
I am trying to scrape a table from espn and send the data to a pandas dataframe in order to export it to excel. I have completed most of the scraping, but am getting stuck on how to send each 'td' tag to a unique dataframe cell within my for loop. (Code is below) Any thoughts? Thanks!
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import os
import csv
import pandas as pd
def make_soup(url):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The BeautifulSoup object built with the "html.parser" parser.
    """
    # The context manager closes the HTTP response once parsing has consumed
    # it; the original left the connection open.
    with urllib.request.urlopen(url) as thepage:
        soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata
# The URL literal was broken across two physical lines, which is a syntax
# error; adjacent string literals inside the parentheses are joined instead.
soup = make_soup(
    "http://www.espn.com/nba/statistics/player/_/stat/scoring-"
    "per-game/sort/avgPoints/qualified/false"
)
# Select the <tr> rows whose class matches this pattern (starts with a
# letter in the range e-o).
regex = re.compile("^[e-o]")
for record in soup.findAll('tr', {"class": regex}):
    for data in record.findAll('td'):
        print(data)
I was actually recently scraping sports websites working on a daily fantasy sports algorithm for a class. This is the script I wrote up. Perhaps this approach can work for you. Build a dictionary. Convert it to a dataframe.
# The URL must be a string literal (it was unquoted, a syntax error).
# NOTE(review): "{0}" (yr) and "{1}" (mode) are unfilled placeholders; call
# url.format(<year>, <mode>) with the values you need before requesting.
url = "http://www.footballdb.com/stats/stats.html?lg=NFL&yr={0}&type=reg&mode={1}&limit=all"
result = requests.get(url)
c = result.content
# Set as Beautiful Soup Object (parser named explicitly so the result does
# not depend on which parser libraries are installed)
soup = BeautifulSoup(c, "html.parser")
# Go to the section of interest
tables = soup.find("table", {'class': 'statistics'})
# data maps column index -> {row index -> cell text};
# headers maps column index -> header text.
data = {}
headers = {}
for i, header in enumerate(tables.findAll('th')):
    data[i] = {}
    headers[i] = str(header.get_text())
table = tables.find('tbody')
for r, row in enumerate(table.select('tr')):
    for i, cell in enumerate(row.select('td')):
        # get_text() already returns str on Python 3, so the old try/except
        # fallback (which called the undefined strip_non_ascii) is not needed.
        # setdefault covers rows that have more cells than there are headers.
        data.setdefault(i, {})[r] = cell.get_text()
# Overwrite column 0 with the text of the name links.
for i, name in enumerate(tables.select('tbody .left .hidden-xs a')):
    data[0][i] = str(name.get_text())
df = pd.DataFrame(data=data)