I am trying to scrape a table from espn and send the data to a pandas dataframe in order to export it to excel. I have completed most of the scraping, but am getting stuck on how to send each 'td' tag to a unique dataframe cell within my for loop. (Code is below) Any thoughts? Thanks!
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import os
import csv
import pandas as pd
def make_soup(url):
thepage = urllib.request.urlopen(url)
soupdata = BeautifulSoup(thepage, "html.parser")
return soupdata
soup = make_soup("http://www.espn.com/nba/statistics/player/_/stat/scoring-
per-game/sort/avgPoints/qualified/false")
regex = re.compile("^[e-o]")
for record in soup.findAll('tr', {"class":regex}):
for data in record.findAll('td'):
print(data)
I was actually recently scraping sports websites working on a daily fantasy sports algorithm for a class. This is the script I wrote up. Perhaps this approach can work for you. Build a dictionary. Convert it to a dataframe.
url = http://www.footballdb.com/stats/stats.html?lg=NFL&yr={0}&type=reg&mode={1}&limit=all
result = requests.get(url)
c = result.content
# Set as Beautiful Soup Object
soup = BeautifulSoup(c)
# Go to the section of interest
tables = soup.find("table",{'class':'statistics'})
data = {}
headers = {}
for i, header in enumerate(tables.findAll('th')):
data[i] = {}
headers[i] = str(header.get_text())
table = tables.find('tbody')
for r, row in enumerate(table.select('tr')):
for i, cell in enumerate(row.select('td')):
try:
data[i][r] = str(cell.get_text())
except:
stat = strip_non_ascii(cell.get_text())
data[i][r] = stat
for i, name in enumerate(tables.select('tbody .left .hidden-xs a')):
data[0][i] = str(name.get_text())
df = pd.DataFrame(data=data)
Related
need to scrape all the table data from rajya sabha website. however, instead of scraping from the url link the code scrapes the original table page by page
from selenium import webdriver
import chromedriver_binary
import os
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import csv
import time
import lxml
url = 'https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start=0'
#url_call = f"https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={i}"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table1 = soup.find('table', id='sam_table')
headers = []
for a in table1.find_all('th'):
title = a.text
headers.append(title)
rsdata = pd.DataFrame(columns = headers)
rsdata.to_csv('rs_debate_data.csv', mode ='a',index=False)
# Create a for loop to fill rajya sabha data
for k in range(0,96):
url_call = f"https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={k}"
page = requests.get(url_call)
for j in table1.find_all('tr')[1:]:
row_data = j.find_all('td')
row = [i.text for i in row_data]
length = len(rsdata)
rsdata.loc[length] = row
rsdata.to_csv('rs_debate_data.csv', mode ='a',index=False, header=False)
print(k)
# Export to csv
# Try to read csv
#rs_data = pd.read_csv('rs_debate_data.csv')
i was trying to scrape only rows related to keyword climate change in the debate title column of the table.
for k in range(0,96):
url_call = "..."
page = requests.get(url_call)
for j in table1.find_all('tr')[1:]:
This loop does a find_all() on the original table1 results, not on the page it just fetched...
I have the following code
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.ipma.pt/pt/otempo/obs.superficie/table-top-stations-all.jsp'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
# Get the content for tab_Co id
temp_table = soup.find('table', id='tab_Co')
# Create Headers
headers = []
for i in temp_table.find_all('th'):
title = i.text
headers.append(title)
# Create DataFrame with the headers as columns
mydata = pd.DataFrame(columns = headers)
# This is where the script goes wrong
# Create loop that retrieves information and appends it to the DataFrame
for j in table1.find_all('tr')[1:]:
row_data = j.find_all('td')
row = [i.text for i in row_data]
length = len(mydata)
mydata.loc[length] = row
What am I doing wrong? The final purpose is to have a dataframe where I can extract the top 4 values for each column
'Temperatura Max (ºC)',
'Temperatura Min (ºC)',
'Prec. acumulada (mm)',
'Rajada máxima (km/h)',
'Humidade Max (%)',
'Humidade Min (%)',
'Pressão atm. (hPa)']
and then use those to generate a daily image.
Any ideas? Thank you in advance!
Disclaimer: This is for a non-for-profit project and no commercial use will be made of the solution.
So this worked, based on this solution by Falsovsky on GitHub
# Import libraries
import requests
import pandas as pd
import regex
# Define target URL
url = 'https://www.ipma.pt/pt/otempo/obs.superficie/table-top-stations-all.jsp'
# Get URL information
page = requests.get(url)
# After inspecting the page apply a regex search
search = re.search('var observations = (.*?);',page.text,re.DOTALL);
# Create dict by loading the json information
json_data = json.loads(search.group(1))
# Create Dataframe from json result
df1 = pd.concat({k: pd.DataFrame(v).T for k, v in json_data.items()}, axis=0)
From the source view-source:https://www.ipma.pt/pt/otempo/obs.superficie/table-top-stations-all.jsp, it is clear that the data is in the th attributes so try scraping with row_data = j.find_all('th')
I have 10 links of companies.
https://www.zaubacorp.com/company/ASHRAFI-MEDIA-NETWORK-PRIVATE-LIMITED/U22120GJ2019PTC111757,
https://www.zaubacorp.com/company/METTLE-PUBLICATIONS-PRIVATE-LIMITED/U22120MH2019PTC329729,
https://www.zaubacorp.com/company/PRINTSCAPE-INDIA-PRIVATE-LIMITED/U22120MH2020PTC335354,
https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665,
https://www.zaubacorp.com/company/BHOOKA-NANGA-FILMS-PRIVATE-LIMITED/U22130DL2019PTC353194,
https://www.zaubacorp.com/company/WHITE-CAMERA-SCHOOL-OF-PHOTOGRAPHY-PRIVATE-LIMITED/U22130JH2019PTC013311,
https://www.zaubacorp.com/company/RLE-PRODUCTIONS-PRIVATE-LIMITED/U22130KL2019PTC059208,
https://www.zaubacorp.com/company/CATALIZADOR-MEDIA-PRIVATE-LIMITED/U22130KL2019PTC059793,
https://www.zaubacorp.com/company/TRIPPLED-MEDIAWORKS-OPC-PRIVATE-LIMITED/U22130MH2019OPC333171,
https://www.zaubacorp.com/company/KRYSTAL-CINEMAZ-PRIVATE-LIMITED/U22130MH2019PTC330391
Now I am trying to scrape tables from these links and save the data in csv columns in well manner formet. I want to scrape tables of "Company Details", "Share Capital & Number of Employees", "Listing and Annual Compliance Details", "Contact Details", "Director Details". If any table has not the data or if any column is missing I want that column blank in output csv file. I have written a code but can't get the output. I am doing something wrong here. Please help
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import csv
import lxml
url_file = "Zaubalinks.txt"
with open(url_file, "r") as url:
url_pages = url.read()
# we need to split each urls into lists to make it iterable
pages = url_pages.split("\n") # Split by lines using \n
# now we run a for loop to visit the urls one by one
data = []
for single_page in pages:
r = requests.get(single_page)
soup = BeautifulSoup(r.content, 'html5lib')
table = soup.find_all('table') # finds all tables
table_top = pd.read_html(str(table))[0] # the top table
try: # try to get the other table if exists
table_capital = pd.read_html(str(table))[5]
table_listing = pd.read_html(str(table))[6]
table_contact = pd.read_html(str(table))[7]
table_director = pd.read_html(str(table))[8]
except:
table_capital = pd.DataFrame()
table_listing = pd.DataFrame()
table_contact = pd.DataFrame()
table_director = pd.DataFrame()
result = pd.concat([table_top, table_capital, table_listing, table_contact, table_director])
data.append(result)
print(data)
pd.concat(data).to_csv('ZaubaAll.csv')
import requests
from bs4 import BeautifulSoup
import pandas as pd
companies = {
'ASHRAFI-MEDIA-NETWORK-PRIVATE-LIMITED/U22120GJ2019PTC111757',
'METTLE-PUBLICATIONS-PRIVATE-LIMITED/U22120MH2019PTC329729',
'PRINTSCAPE-INDIA-PRIVATE-LIMITED/U22120MH2020PTC335354',
'CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665',
'BHOOKA-NANGA-FILMS-PRIVATE-LIMITED/U22130DL2019PTC353194',
'WHITE-CAMERA-SCHOOL-OF-PHOTOGRAPHY-PRIVATE-LIMITED/U22130JH2019PTC013311',
'RLE-PRODUCTIONS-PRIVATE-LIMITED/U22130KL2019PTC059208',
'CATALIZADOR-MEDIA-PRIVATE-LIMITED/U22130KL2019PTC059793',
'TRIPPLED-MEDIAWORKS-OPC-PRIVATE-LIMITED/U22130MH2019OPC333171',
'KRYSTAL-CINEMAZ-PRIVATE-LIMITED/U22130MH2019PTC330391'
}
def main(url):
with requests.Session() as req:
goal = []
for company in companies:
r = req.get(url.format(company))
df = pd.read_html(r.content)
target = pd.concat([df[x].T for x in [0, 3, 4]], axis=1)
goal.append(target)
new = pd.concat(goal)
new.to_csv("data.csv")
main("https://www.zaubacorp.com/company/{}")
Fortunatley, it seems you can get there with simpler methods. Taking one reandom link as an example, it should be something like:
url = 'https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665'
import pandas as pd
tables = pd.read_html(url)
From here, your tables are in tables[0], tables[3], tables[4], tables[15], etc. Just use a for loop to rotate through all the urls.
I wanted to try to scrape some specific columns (Company details column) in the CNBC Nasdaq 100 website specifically the Adobe stocks, below is the snippet of my code
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
def get_company_info(url):
original_url = url
key = {}
l = []
page_response = requests.get(url, timeout=240)
page_content = BeautifulSoup(page_response.content, "html.parser")
name = page_content.find('div',{"class":"quote-section-header large-header"}).find("span",{"class":"symbol"}).text
description = page_content.find_all('div',{"class":"moduleBox"})
for items in description:
for i in range(len(items.find_all("tr"))-1):
# Gather data
key["stock_desc"] = items.find_all("td", {"class":"desc"})[i].find('div',attrs={'id':'descLong'}).text
shares = items.find_all("td").find("table",attrs={"id":"shares"})
for rest_of_items in shares:
for i in range(len(items.find_all("tr"))-1):
key["stock_outstanding-shares"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_ownership"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_market_cap"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_lastSplit"] = items.find_all("td", {"class":"bold aRit"})[i].text
# Print ("")
l.append(key)
key['name'] = name
df = pd.DataFrame(l)
print(df)
return key, df
get_company_info("https://www.cnbc.com/quotes/?symbol=ADBE&tab=profile")
So, I'm keen to get the result in dataframe so that I can change to CSV file, but my code keep showing empty dataframe result, Below are the error shown
The result I wanted is something like this
The information you are looking for is not available in the url you requested. This is because the information is fetched by the page using a JavaScript. Which in turn requests a different URL which provides the data.
Example code
from bs4 import BeautifulSoup
import requests
page=requests.get("https://apps.cnbc.com/view.asp?symbol=ADBE.O&uid=stocks/summary")
soup = BeautifulSoup(page.content, 'html.parser')
Name=soup.find("h5",id="companyName").text
stock_desc= soup.find("div",id="descLong").text
table=soup.find("table",id="shares")
details=table.find_all("td", class_="bold aRit")
stock_outstanding_shares= details[0].text
stock_ownership= details[1].text
stock_market_cap= details[2].text
stock_lastSplit= details[3].text
You can create dataframe and export to csv.
I am trying to get all the graphics card details into a csv file but not able to scrape the data(doing this as a project to scrape data for learning purposes). I am new to python and html.
I am using request and beautifulsoup libraries.
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
uClient = uReq(my_url)
Negg = uClient.read()
uClient.close
Complete_Graphics_New_Egg = soup(Negg,"html.parser")
Container_Main = Complete_Graphics_New_Egg.findAll("div",{"class":"item-container"})
Container_Main5 = str(Container_Main[5])
path_file='C:\\Users\\HP\\Documents\\Python\\Container_Main5.txt'
file_1 = open(path_file,'w')
file_1.write(Container_Main5)
file_1.close()
##Container_Main_details = Container_Main5.a
#div class="item-badges"
Container_5_1 = str(Container_Main[5].findAll("ul",{"class":"item-features"}))
path_file='C:\\Users\\HP\\Documents\\Python\\Container_test_5_1.txt'
file_5_1 = open(path_file,'w')
file_5_1.write(Container_5_1)
file_5_1.close()
Container_5_1.li
Container_5_2 = str(Container_Main[5].findAll("p",{"class":"item-promo"}))
path_file='C:\\Users\\HP\\Documents\\Python\\Container_test_5_2.txt'
file_5_2 = open(path_file,'w')
file_5_2.write(Container_5_2)
file_5_2.close()
##p class="item-promo"
##div class="item-info"
This should get you started. I'll break it down a bit too for you so you can modify and play while you're learning. I'm also suggesting to use Pandas, as it's a popular library for data manipulation and you'll be using in the near future if you're already not using it
I first initialize a results dataframe to store all the data you'll be parsing:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
Next, get the html form the site and pass that into BeautifulSoup:
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
response = requests.get(my_url)
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')
Then you had it find all the tags you were interested in. The only thing I added was have it iterate over each of those tags/elements it finds:
Container_Main = soup.find_all("div",{"class":"item-container"})
for container in Container_Main:
and then in each of those containers, grab the data you wanted from the item features and item promo. I store that data into a temporary dataframe (of 1 row) and then append that to my results dataframe. So after each iteration, the temp dataframe is overwritten with the new info, but the results won;t be overwritten, it'll just add on.
Lastly, use pandas to save the dataframe to csv.
results.to_csv('path/file.csv', index=False)
So full code:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
response = requests.get(my_url)
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')
Container_Main = soup.find_all("div",{"class":"item-container"})
for container in Container_Main:
item_features = container.find("ul",{"class":"item-features"})
# if there are no item-fetures, move on to the next container
if item_features == None:
continue
temp_df = pd.DataFrame(index=[0])
features_list = item_features.find_all('li')
for feature in features_list:
split_str = feature.text.split(':')
header = split_str[0]
data = split_str[1].strip()
temp_df[header] = data
promo = container.find_all("p",{"class":"item-promo"})[0].text
temp_df['promo'] = promo
results = results.append(temp_df, sort = False).reset_index(drop = True)
results.to_csv('path/file.csv', index=False)