I'm attempting to scrape the list of tickers for the Nasdaq 100 from the CNBC website: https://www.cnbc.com/nasdaq-100/. I am new to Beautiful Soup, but if there is a more straightforward way to scrape the list and save the data, I am interested in any solution.
The code below does not return an error; however, it does not return any tickers either.
import bs4 as bs
import pickle  # serializes any Python object so that we do not have to go back to the CNBC website
               # each time we want to use the 100 ticker symbols
import requests

def save_nasdaq_tickers():
    '''We start by getting the source code for CNBC. We will use the requests module for this.'''
    resp = requests.get('https://www.cnbc.com/nasdaq-100')
    soup = bs.BeautifulSoup(resp.text, "lxml")  # resp.text is the text of the page source returned by requests
    table = soup.find('table', {'class': "data quoteTable"})  # the table whose class matches the data we want from CNBC
    tickers = []  # empty tickers list
    # Next we iterate through the table.
    for row in table.findAll('tr')[1:]:  # all table rows except the header row (row 0), so 1 onward
        ticker = row.findAll('td')[0].txt  # td holds the columns; 0 is the first column, which I take to be the tickers
        # We specify .txt because it is a soup object
        tickers.append(ticker)
    # Save the list of tickers using pickle and with open
    with open("Nasdaq100Tickers", "wb") as f:  # name the file Nasdaq100Tickers
        pickle.dump(tickers, f)  # dump the tickers to file f
    print(tickers)
    return tickers

save_nasdaq_tickers()
There is just a small mistake in your code, if you are wondering why you got nothing in your tickers: change ticker = row.findAll('td')[0].txt to ticker = row.findAll('td')[0].text. But when you want the full content of a dynamic page, you need Selenium.
import bs4 as bs
from selenium import webdriver

def save_nasdaq_tickers():
    dr = webdriver.Chrome()
    try:
        dr.get("https://www.cnbc.com/nasdaq-100")
        text = dr.page_source
    except Exception as e:
        raise e
    finally:
        dr.close()
    soup = bs.BeautifulSoup(text, "lxml")
    table = soup.find('table', {'class': "data quoteTable"})
    tickers = [row.findAll('td')[0].text for row in table.findAll('tr')[1:]]  # .text this time, not .txt
    return tickers
You can also mimic the XHR request the page makes and parse out the JSON containing the data you are after:
import requests
import pandas as pd
import json
from pandas.io.json import json_normalize  # in newer pandas this is pd.json_normalize
from bs4 import BeautifulSoup

url = 'https://quote.cnbc.com/quote-html-webservice/quote.htm?partnerId=2&requestMethod=quick&exthrs=1&noform=1&fund=1&output=jsonp&symbols=AAL|AAPL|ADBE|ADI|ADP|ADSK|ALGN|ALXN|AMAT|AMGN|AMZN|ATVI|ASML|AVGO|BIDU|BIIB|BMRN|CDNS|CELG|CERN|CHKP|CHTR|CTRP|CTAS|CSCO|CTXS|CMCSA|COST|CSX|CTSH&callback=quoteHandler1'
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
# The response is JSONP; strip the quoteHandler1( ... ) wrapper to leave plain JSON
s = soup.select('html')[0].text.strip('quoteHandler1(').strip(')')
data = json.loads(s)
data = json_normalize(data)
df = pd.DataFrame(data)
print(df[['symbol', 'last']])
This returns the quote data as JSON; the symbol and last-price fields are then printed from the DataFrame.
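If you want to query a different or longer set of symbols, you can build the symbols parameter yourself by joining the tickers with |. A minimal sketch, assuming the same endpoint and parameters as the URL above (the ticker list here is only an example):

import requests
import json

# Example tickers only; substitute whatever list you scraped or maintain yourself
tickers = ['AAPL', 'MSFT', 'AMZN']
url = ('https://quote.cnbc.com/quote-html-webservice/quote.htm'
       '?partnerId=2&requestMethod=quick&exthrs=1&noform=1&fund=1'
       '&output=jsonp&symbols=' + '|'.join(tickers) + '&callback=quoteHandler1')

res = requests.get(url)
# Strip the JSONP wrapper quoteHandler1( ... ) to get plain JSON
payload = res.text[res.text.index('(') + 1:res.text.rindex(')')]
quotes = json.loads(payload)
print(quotes)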
I've recently started to play with BeautifulSoup in order to extract specific data from HTML and convert it to a pandas DataFrame. I've been stuck because the data I am returning is a string and not a list.
My goal is to get everything under the results tag in the HTML in its original data type (a list of dicts) so I can iterate over it and return a DataFrame.
Here is the code:
import requests
from bs4 import BeautifulSoup

country_code = 'BE'
city = 'BRUSSELS'.upper()
code_postal = '1000'
max_price = '250000'

URL = f'https://www.immoweb.be/en/search/apartment/for-sale?countries={country_code}&districts={city}&postalCodes={code_postal}&maxPrice={max_price}&orderBy=relevance'
print(URL)

page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
page_results = soup.find('iw-search')
page_attributes = page_results.attrs

for key, items in page_attributes.items():
    print(type(key), type(items))
The raw HTML looks like this (note it is truncated, not the full markup):
<iw-search :anchor-card-id="null" :criteria='{"countries":"BE","districts":[{"queryValue":"BRUSSELS","queryParam":"districts","slug":"brussels","shortLabel":"Brussels (District)","label":"Brussels (District)","translations":{"fr":"Bruxelles","en":"Brussels","nl":"Brussel"}}],"maxPrice":250000,"postalCodes":[{"queryValue":"1000","queryParam":"postalCodes","slug":"brussels-city","shortLabel":"Brussels City (1000)","label":"Brussels City (1000)","translations":{"fr":"Bruxelles ville","en":"Brussels City","nl":"Brussel"}}],"propertyTypes":"APARTMENT","transactionTypes":"FOR_SALE"}' :geo-point-count="876" :labellized-search='"Apartment for sale - Brussels (District) (and 1 more)"' :marketing-count="1457" :page="1" :result-count="1457" :results='[{"id":9103642,"cluster":{"minPrice":null,"maxPrice":null,"minRoom":null,"maxRoom":null,"minSurface":null,"maxSurface":null,"projectInfo":null,"bedroomRange":"","surfaceRange":""},"customerLogoUrl":"https:\/\/static.immoweb.be\/logos\/145409.gif?cache=2016051503060","customerName":"Expertissimmo","flags":{"main":"under_option","secondary":[],"percentSold":null},"media":{"pictures":[{"smallUrl":"https:\/\/static.immoweb.be\/photos\/0\/9\/1\/0\/3\/6\/4\/2\/9103642_1.gif?cache=20210106041435","mediumUrl":"https:\/\/static.immoweb.be\/photos\/0\/9\/1\/0\/3\/6\/4\/2\/M_9103642_1.jpg?cache=20210106041435","largeUrl":"https:\/\/static.immoweb.be\/photos\/0\/9\/1\/0\/3\/6\/4\/2\/9103642_1.jpg?cache=20210106041435","isVertical":false},{"smallUrl":"https:\/\/static.immoweb.be\/photos\/0\/9\/1\/0\/3\/6\/4\/2\/9103642_2.gif?cache=20210106041435","mediumUrl":```
Thanks in advance for your feedback.
You can get a list of dictionary objects using json (the string is valid JSON):
import json
res = json.loads(page_attributes[':results'])
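From there, turning that list of dicts into a DataFrame is one more step. A minimal sketch, continuing from the question's page_attributes (pd.json_normalize flattens nested keys such as cluster and media; the column names are whatever the site returns):

import json
import pandas as pd

results = json.loads(page_attributes[':results'])  # list of dicts
df = pd.json_normalize(results)                    # flattens nested keys like cluster.minPrice
print(df.columns.tolist())
print(df.head())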
I have 10 company links.
https://www.zaubacorp.com/company/ASHRAFI-MEDIA-NETWORK-PRIVATE-LIMITED/U22120GJ2019PTC111757,
https://www.zaubacorp.com/company/METTLE-PUBLICATIONS-PRIVATE-LIMITED/U22120MH2019PTC329729,
https://www.zaubacorp.com/company/PRINTSCAPE-INDIA-PRIVATE-LIMITED/U22120MH2020PTC335354,
https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665,
https://www.zaubacorp.com/company/BHOOKA-NANGA-FILMS-PRIVATE-LIMITED/U22130DL2019PTC353194,
https://www.zaubacorp.com/company/WHITE-CAMERA-SCHOOL-OF-PHOTOGRAPHY-PRIVATE-LIMITED/U22130JH2019PTC013311,
https://www.zaubacorp.com/company/RLE-PRODUCTIONS-PRIVATE-LIMITED/U22130KL2019PTC059208,
https://www.zaubacorp.com/company/CATALIZADOR-MEDIA-PRIVATE-LIMITED/U22130KL2019PTC059793,
https://www.zaubacorp.com/company/TRIPPLED-MEDIAWORKS-OPC-PRIVATE-LIMITED/U22130MH2019OPC333171,
https://www.zaubacorp.com/company/KRYSTAL-CINEMAZ-PRIVATE-LIMITED/U22130MH2019PTC330391
Now I am trying to scrape tables from these links and save the data to CSV columns in a well-organized format. I want to scrape the tables for "Company Details", "Share Capital & Number of Employees", "Listing and Annual Compliance Details", "Contact Details", and "Director Details". If any table is missing data, or if any column is missing, I want that column left blank in the output CSV file. I have written some code but can't get the output. I am doing something wrong here. Please help.
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import csv
import lxml

url_file = "Zaubalinks.txt"
with open(url_file, "r") as url:
    url_pages = url.read()

# we need to split the urls into a list to make them iterable
pages = url_pages.split("\n")  # split by lines using \n

# now we run a for loop to visit the urls one by one
data = []
for single_page in pages:
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')
    table = soup.find_all('table')  # finds all tables
    table_top = pd.read_html(str(table))[0]  # the top table
    try:  # try to get the other tables if they exist
        table_capital = pd.read_html(str(table))[5]
        table_listing = pd.read_html(str(table))[6]
        table_contact = pd.read_html(str(table))[7]
        table_director = pd.read_html(str(table))[8]
    except:
        table_capital = pd.DataFrame()
        table_listing = pd.DataFrame()
        table_contact = pd.DataFrame()
        table_director = pd.DataFrame()
    result = pd.concat([table_top, table_capital, table_listing, table_contact, table_director])
    data.append(result)
    print(data)

pd.concat(data).to_csv('ZaubaAll.csv')
import requests
from bs4 import BeautifulSoup
import pandas as pd

companies = {
    'ASHRAFI-MEDIA-NETWORK-PRIVATE-LIMITED/U22120GJ2019PTC111757',
    'METTLE-PUBLICATIONS-PRIVATE-LIMITED/U22120MH2019PTC329729',
    'PRINTSCAPE-INDIA-PRIVATE-LIMITED/U22120MH2020PTC335354',
    'CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665',
    'BHOOKA-NANGA-FILMS-PRIVATE-LIMITED/U22130DL2019PTC353194',
    'WHITE-CAMERA-SCHOOL-OF-PHOTOGRAPHY-PRIVATE-LIMITED/U22130JH2019PTC013311',
    'RLE-PRODUCTIONS-PRIVATE-LIMITED/U22130KL2019PTC059208',
    'CATALIZADOR-MEDIA-PRIVATE-LIMITED/U22130KL2019PTC059793',
    'TRIPPLED-MEDIAWORKS-OPC-PRIVATE-LIMITED/U22130MH2019OPC333171',
    'KRYSTAL-CINEMAZ-PRIVATE-LIMITED/U22130MH2019PTC330391'
}

def main(url):
    with requests.Session() as req:
        goal = []
        for company in companies:
            r = req.get(url.format(company))
            df = pd.read_html(r.content)
            target = pd.concat([df[x].T for x in [0, 3, 4]], axis=1)
            goal.append(target)
        new = pd.concat(goal)
        new.to_csv("data.csv")

main("https://www.zaubacorp.com/company/{}")
Fortunately, it seems you can get there with simpler methods. Taking one random link as an example, it should be something like:
import pandas as pd

url = 'https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665'
tables = pd.read_html(url)
From here, your tables are in tables[0], tables[3], tables[4], tables[15], etc. Just use a for loop to iterate through all the urls, as in the sketch below.
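A minimal sketch of that loop, under the assumption that the table positions (0, 3 and 4 here, mirroring the answer above) are the same on every company page; the Zaubalinks.txt file name is taken from the question and the output name is just an example:

import pandas as pd

# read the 10 urls from the question's text file, one per line
with open("Zaubalinks.txt") as f:
    urls = [line.strip() for line in f if line.strip()]

frames = []
for url in urls:
    tables = pd.read_html(url)                          # every <table> on the page
    combined = pd.concat([tables[i].T for i in (0, 3, 4)], axis=1)
    frames.append(combined)

pd.concat(frames).to_csv("ZaubaAll.csv", index=False)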
I wanted to try to scrape some specific columns (the company details) from the CNBC Nasdaq 100 website, specifically for the Adobe stock. Below is a snippet of my code:
# Importing libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd

def get_company_info(url):
    original_url = url
    key = {}
    l = []
    page_response = requests.get(url, timeout=240)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    name = page_content.find('div', {"class": "quote-section-header large-header"}).find("span", {"class": "symbol"}).text
    description = page_content.find_all('div', {"class": "moduleBox"})
    for items in description:
        for i in range(len(items.find_all("tr")) - 1):
            # Gather data
            key["stock_desc"] = items.find_all("td", {"class": "desc"})[i].find('div', attrs={'id': 'descLong'}).text
            shares = items.find_all("td").find("table", attrs={"id": "shares"})
            for rest_of_items in shares:
                for i in range(len(items.find_all("tr")) - 1):
                    key["stock_outstanding-shares"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_ownership"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_market_cap"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_lastSplit"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    l.append(key)
    key['name'] = name
    df = pd.DataFrame(l)
    print(df)
    return key, df

get_company_info("https://www.cnbc.com/quotes/?symbol=ADBE&tab=profile")
So, I'm keen to get the result into a DataFrame so that I can save it to a CSV file, but my code keeps returning an empty DataFrame rather than the company profile fields I'm after (description, outstanding shares, ownership, market cap, last split).
The information you are looking for is not available at the URL you requested. This is because the information is fetched by the page using JavaScript, which in turn requests a different URL that provides the data.
Example code
from bs4 import BeautifulSoup
import requests

page = requests.get("https://apps.cnbc.com/view.asp?symbol=ADBE.O&uid=stocks/summary")
soup = BeautifulSoup(page.content, 'html.parser')

Name = soup.find("h5", id="companyName").text
stock_desc = soup.find("div", id="descLong").text

table = soup.find("table", id="shares")
details = table.find_all("td", class_="bold aRit")
stock_outstanding_shares = details[0].text
stock_ownership = details[1].text
stock_market_cap = details[2].text
stock_lastSplit = details[3].text
You can then create a DataFrame from these values and export it to CSV.
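For example, a minimal sketch of that last step, continuing from the variables defined above (the column labels and the output file name are just illustrative):

import pandas as pd

df = pd.DataFrame([{
    "name": Name,
    "description": stock_desc,
    "outstanding_shares": stock_outstanding_shares,
    "ownership": stock_ownership,
    "market_cap": stock_market_cap,
    "last_split": stock_lastSplit,
}])
df.to_csv("adbe_profile.csv", index=False)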
I am trying to get all the graphics card details into a CSV file but am not able to scrape the data (doing this as a project to learn scraping). I am new to Python and HTML.
I am using the requests and BeautifulSoup libraries.
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
uClient = uReq(my_url)
Negg = uClient.read()
uClient.close()
Complete_Graphics_New_Egg = soup(Negg,"html.parser")
Container_Main = Complete_Graphics_New_Egg.findAll("div",{"class":"item-container"})
Container_Main5 = str(Container_Main[5])
path_file='C:\\Users\\HP\\Documents\\Python\\Container_Main5.txt'
file_1 = open(path_file,'w')
file_1.write(Container_Main5)
file_1.close()
##Container_Main_details = Container_Main5.a
#div class="item-badges"
Container_5_1 = str(Container_Main[5].findAll("ul",{"class":"item-features"}))
path_file='C:\\Users\\HP\\Documents\\Python\\Container_test_5_1.txt'
file_5_1 = open(path_file,'w')
file_5_1.write(Container_5_1)
file_5_1.close()
Container_5_1.li
Container_5_2 = str(Container_Main[5].findAll("p",{"class":"item-promo"}))
path_file='C:\\Users\\HP\\Documents\\Python\\Container_test_5_2.txt'
file_5_2 = open(path_file,'w')
file_5_2.write(Container_5_2)
file_5_2.close()
##p class="item-promo"
##div class="item-info"
This should get you started. I'll break it down a bit for you so you can modify and experiment while you're learning. I also suggest using pandas, as it's a popular library for data manipulation and you'll likely be using it in the near future if you're not already.
I first initialize a results dataframe to store all the data you'll be parsing:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
Next, get the HTML from the site and pass it into BeautifulSoup:
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
response = requests.get(my_url)
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')
Then you had it find all the tags you were interested in. The only thing I added was to have it iterate over each of those tags/elements it finds:
Container_Main = soup.find_all("div",{"class":"item-container"})
for container in Container_Main:
and then, in each of those containers, grab the data you want from the item features and item promo. I store that data in a temporary dataframe (of one row) and then append it to my results dataframe. After each iteration the temp dataframe is overwritten with the new info, but the results won't be overwritten; they just keep adding on.
Lastly, use pandas to save the dataframe to csv.
results.to_csv('path/file.csv', index=False)
So full code:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
response = requests.get(my_url)
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')
Container_Main = soup.find_all("div",{"class":"item-container"})
for container in Container_Main:
    item_features = container.find("ul", {"class": "item-features"})
    # if there are no item-features, move on to the next container
    if item_features == None:
        continue
    temp_df = pd.DataFrame(index=[0])
    features_list = item_features.find_all('li')
    for feature in features_list:
        split_str = feature.text.split(':')
        header = split_str[0]
        data = split_str[1].strip()
        temp_df[header] = data
    promo = container.find_all("p", {"class": "item-promo"})[0].text
    temp_df['promo'] = promo
    results = results.append(temp_df, sort=False).reset_index(drop=True)

results.to_csv('path/file.csv', index=False)
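One caveat: DataFrame.append was deprecated and later removed in recent pandas releases, so on a newer version you can collect the one-row frames in a list and concatenate once at the end. A sketch of just that change, reusing Container_Main from above (promo column omitted for brevity):

row_frames = []
for container in Container_Main:
    item_features = container.find("ul", {"class": "item-features"})
    if item_features is None:
        continue
    temp_df = pd.DataFrame(index=[0])
    for feature in item_features.find_all('li'):
        # split "Header: value" into a column name and its value
        header, _, value = feature.text.partition(':')
        temp_df[header] = value.strip()
    row_frames.append(temp_df)

results = pd.concat(row_frames, ignore_index=True, sort=False)
results.to_csv('path/file.csv', index=False)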
I am trying to scrape a table from ESPN and send the data to a pandas dataframe in order to export it to Excel. I have completed most of the scraping, but am getting stuck on how to send each 'td' tag to a unique dataframe cell within my for loop. (Code is below.) Any thoughts? Thanks!
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import os
import csv
import pandas as pd
def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

soup = make_soup("http://www.espn.com/nba/statistics/player/_/stat/scoring-per-game/sort/avgPoints/qualified/false")

regex = re.compile("^[e-o]")
for record in soup.findAll('tr', {"class": regex}):
    for data in record.findAll('td'):
        print(data)
I was actually scraping sports websites recently while working on a daily fantasy sports algorithm for a class. This is the script I wrote up. Perhaps this approach can work for you: build a dictionary, then convert it to a dataframe.
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Stand-in for the strip_non_ascii helper used below (its definition is not shown in the original script)
def strip_non_ascii(text):
    return ''.join(ch for ch in text if ord(ch) < 128)

# {0} is the season year and {1} the stats mode; fill them in with str.format before requesting
url = "http://www.footballdb.com/stats/stats.html?lg=NFL&yr={0}&type=reg&mode={1}&limit=all"
result = requests.get(url)
c = result.content

# Set as Beautiful Soup object
soup = BeautifulSoup(c, "html.parser")

# Go to the section of interest
tables = soup.find("table", {'class': 'statistics'})

data = {}
headers = {}
for i, header in enumerate(tables.findAll('th')):
    data[i] = {}
    headers[i] = str(header.get_text())

table = tables.find('tbody')
for r, row in enumerate(table.select('tr')):
    for i, cell in enumerate(row.select('td')):
        try:
            data[i][r] = str(cell.get_text())
        except:
            stat = strip_non_ascii(cell.get_text())
            data[i][r] = stat

for i, name in enumerate(tables.select('tbody .left .hidden-xs a')):
    data[0][i] = str(name.get_text())

df = pd.DataFrame(data=data)
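Since the question asked for an Excel export, one way to finish is to rename the numeric columns using the headers dictionary collected above and write the frame out. A sketch continuing from df and headers (to_excel needs openpyxl installed; the file names are just placeholders):

# Map the column indices back to the header text scraped from the <th> tags
df = df.rename(columns=headers)

# Export to Excel (requires openpyxl), or fall back to CSV
df.to_excel("stats.xlsx", index=False)
df.to_csv("stats.csv", index=False)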