Parsing stock recommended rating from Yahoo stock site - python

I'm looking to parse a specific Yahoo stock page using a Python script (take https://finance.yahoo.com/quote/NOA?ltr=1 for example) and print the "Recommended Rating" to a file. Recommended rating can be found on the right hand side of the page about half way down.
This is what I have so far
try:
import urllib.request as urllib2
except ImportError:
import urllib2
from bs4 import BeautifulSoup
quote_page = 'https://finance.yahoo.com/quote/NOA?ltr=1'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, "html.parser")
name_box = soup.find(attrs={'div': 'rating-text Arrow South Fw(b) Bgc($strongBuy) Bdtc($strongBuy)'})
name = name_box.text.strip()
print(name)
The tricky part is that I believe the recommended rating is only listed on the page as InnerHTML. I'm not sure how i'd go about retrieving this data, a push in the right direction would be greatly appreciated!

Yahoo makes a get request to the url in the script below for some of their data. If you look in the network tab of the developer tools and refresh the page for NOA stock you should see 'NOA?formatt...'. Click this and then view the response object to see some of the data. You'll need the requests module for the script below to work: pip install requests.
# get_mean_recs.py
import csv
from datetime import datetime
import requests
import sys
get_date = lambda : datetime.utcnow().strftime('%d-%m-%Y')
lhs_url = 'https://query2.finance.yahoo.com/v10/finance/quoteSummary/'
rhs_url = '?formatted=true&crumb=swg7qs5y9UP&lang=en-US&region=US&' \
'modules=upgradeDowngradeHistory,recommendationTrend,' \
'financialData,earningsHistory,earningsTrend,industryTrend&' \
'corsDomain=finance.yahoo.com'
def get_mean_rec(ticker):
url = lhs_url + ticker + rhs_url
r = requests.get(url)
if not r.ok:
return -1
result = r.json()['quoteSummary']['result'][0]
return result['financialData']['recommendationMean']['fmt']
def read_from_csv(fn):
with open(fn, 'r') as f:
reader = csv.reader(f)
for line in reader:
for ticker in line:
yield ticker
def write_to_csv(fn, data):
with open(fn, 'a') as f:
fieldnames = data[0].keys()
writer = csv.DictWriter(f, fieldnames=fieldnames)
for item in data:
writer.writerow(item)
def assemble_dict(ticker):
return {
'ticker': ticker,
'mean_rec': get_mean_rec(ticker),
'utc_date': get_date()
}
def main():
in_fn = sys.argv[1]
out_fn = sys.argv[2]
data = [assemble_dict(ticker) for ticker in read_from_csv(in_fn)]
write_to_csv(out_fn, data)
if __name__ == '__main__':
main()
Usage:
python get_mean_recs.py input.csv output.csv

There is an API for accessing the yahoo finance information, e.g.
http://finance.yahoo.com/d/quotes.csv?s=NOA&f=snd1l1yr
I think you may be better off using that to fetch the required information. Some more info on the parameters can be found here:
http://wern-ancheta.com/blog/2015/04/05/getting-started-with-the-yahoo-finance-api/

Related

Stuck Scraping with Beautifulsoup

So i'm trying to scrape a html webpage. It has novel chapters and i'm trying to get the text and store in text files to read offline. I don't have any previous experience with html or other things either. So the webpage I am trying to scrape is this. And the code i've been testing so far looks like this
`
import sys
import requests
import time
import re
from bs4 import BeautifulSoup
def browse_and_scrape(seed_url, page_number=1):
# Fetch the URL - We will be using this to append to images and info routes
url_pat = re.compile(r"(http://.*\.org)")
source_url = url_pat.search(seed_url).group(0)
# Page_number from the argument gets formatted in the URL & Fetched
formatted_url = seed_url.format(str(page_number))
# print(url_pat,source_url,formatted_url)
try:
html_text = requests.get(formatted_url).text
# print(html_text)
# Prepare the soup
soup = BeautifulSoup(html_text, "html.parser")
print(soup.find_all(id="chapterContent")[0]["style"])
print(f"Now Scraping - {formatted_url}")
# help = soup.find_all("div",class_="chapter-content text-normal")[0].text.strip().encode("ascii", "ignore").decode("ascii")
# for node in soup.findAll("div",class_="chapter-content text-normal"):
# print(node)
# print(''.join(node.findAll(text=True)))
# for node in soup.findAll("div"):
# # print(node)
# print(''.join(node.findAll(text=True)))
# help = soup.find_all("div",class_="chapter-content text-normal")[0]
# print(''.join(help.findAll(text=True)))
# print(help)
except Exception as e:
return e
return true
if __name__ == "__main__":
# seed_url = "http://books.toscrape.com/catalogue/page-{}.html"
seed_url = "http://wnmtl.org/chapter/324909-heavenly-wolf-valley.html"
# seed_url = "http://wnmtl.org/chapter/{}.html"
print("Web scraping has begun")
result = browse_and_scrape(seed_url)
if result == True:
print("Web scraping is now complete!")
else:
print(f"Oops, That doesn't seem right!!! - {result}")`
All the commented stuff are things i've been trying to rip the text from the tag. From my inspection of the developer console in the browser, all the text is in the tag with id of chapter content. My plan is to iteratively get the text, stuff it, get the link for the next page and repeat but i've been stuck for a bit now, any suggestions.
Instead of scraping each page, you can directly get the text from this API endpoint using requests.
https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/324909
The last item in the above API is the chapter ID (324909). You can navigate to chapters by giving in the chapter IDs.
The next and prev chapter IDs are present in the current chapter's API endpoint. Have a look at the above URL in browser to understand it better.
Here is the full recursive code that writes the text from 3 pages to a file called novel.txt. You may change the number of pages and other details as per your need.
import requests
def get_data(chapter_id, pages):
if pages == 0:
return
url = 'https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/' + str(chapter_id)
r = requests.get(url)
x = r.json()
pre_id = x['data']['preId']
next_id = x['data']['nextId']
title = x['data']['title']
content = x['data']['content']
chapter_title = f'\n***** Chapter: {title} *****\n'
with open('novel.txt', 'a') as f:
f.write(chapter_title)
f.write(content + '\n')
print(f"Chapter: '{title}' written to file.")
get_data(next_id, pages-1)
curr_id = '324909'
get_data(curr_id, 3)
Chapter: 'Heavenly Wolf Valley' written to file.
Chapter: 'Leaving' written to file.
Chapter: 'Pure Fabrication' written to file.

I'm not able to split my code into functions

I made a code to download pdfs from a website, and it works perfectly, downloading all the PDF's (first code below). However, when I split my code into functions, only two links are inserted into the "papers" list and the execution ends with code zero, but the following warning message appears:
GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html.parser"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
The code that caused this warning is on line 11 of the file C:\Downloads\EditoraCL\download_pdf.py. To get rid of this warning, pass the additional argument 'features="html.parser"' to the BeautifulSoup constructor.
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
FIRST CODE:
import requests
import httplib2
import os
from bs4 import BeautifulSoup, SoupStrainer
papers = []
pdfs = []
http = httplib2.Http()
status, response = http.request('https://www.snh2021.anpuh.org/site/anais')
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
if link.has_attr('href'):
papers.append(link['href'])
print(papers)
for x in papers:
if x.endswith('pdf'):
pdfs.append(x)
print(pdfs)
def baixa_arquivo(url, endereco):
resposta = requests.get(url)
if resposta.status_code == requests.codes.OK:
with open(endereco, 'wb') as novo_arquivo:
novo_arquivo.write(resposta.content)
print('Download concluĂ­do. Salvo em {}'.format(endereco))
else:
resposta.raise_for_status()
if __name__ == '__main__':
url_basica = 'https://www.snh2021.anpuh.org/{}'
output = 'Download'
for i in range(1, len(pdfs)):
nome_do_arquivo = os.path.join(output, 'artigo{}.pdf'.format(i))
a = pdfs[i]
z = url_basica.format(a)
y = requests.get(z)
if y.status_code!=404:
baixa_arquivo(z, nome_do_arquivo)
CODE DIVIDED INTO FUNCTIONS:
import requests
import httplib2
import os
from bs4 import BeautifulSoup, SoupStrainer
papers = []
pdfs = []
def busca_links():
http = httplib2.Http()
status, response = http.request('https://www.snh2021.anpuh.org/site/anais')
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
if link.has_attr('href'):
papers.append(link['href'])
return papers
def links_pdf():
for x in papers:
if x.endswith('pdf'):
pdfs.append(x)
return pdfs
def baixa_arquivo(url, endereco):
resposta = requests.get(url)
if resposta.status_code == requests.codes.OK:
with open(endereco, 'wb') as novo_arquivo:
novo_arquivo.write(resposta.content)
return f'Download concluĂ­do. Salvo em {endereco}'
else:
resposta.raise_for_status()
if __name__ == '__main__':
busca_links()
links_pdf()
url_basica = 'https://www.snh2021.anpuh.org/{}'
output = 'Download'
print(papers)
print(pdfs)
for i in range(1, len(pdfs)):
nome_do_arquivo = os.path.join(output, 'artigo{}.pdf'.format(i))
a = pdfs[i]
z = url_basica.format(a)
y = requests.get(z)
if y.status_code!=404:
baixa_arquivo(z, nome_do_arquivo)
Could someone help me understand why the second code is giving this error?
Functions do not share their inner variables, so in order to make your code work, you should assign "papers" to the function itself, after returning it inside the function ( papers = busca_links() and links_pdf(papers) ).
Anyway, for the purpose of organization and clearer code, you should use classes and methods:
import os
import requests
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
class Pdf:
def __init__(self, base_url, url):
self.main_dir = os.path.dirname(__file__)
self.pdfs_dir = os.path.join(self.main_dir, 'pdfs')
self.base_url = base_url
self.url = url
def get_links(self):
http = httplib2.Http()
status, response = http.request(self.url)
self.links = []
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
if link.has_attr('href'):
if link['href'].endswith('pdf'):
self.links.append(f"{self.base_url}{link['href']}")
def download_pdf(self):
for link in self.links:
response = requests.get(link, stream=True)
if response.status_code == 200:
file_path = os.path.join(self.pdfs_dir, link.split('/')[-1])
with open(file_path, 'wb') as f:
f.write(response.content)
print('Success. Saved on {}'.format(file_path))
else:
# Should handle errors here, by appending them to a list and
# trying again later.
print('Error.')
if __name__ == '__main__':
base_url = 'https://www.snh2021.anpuh.org/'
url = f'{base_url}site/anais'
pdf = Pdf(base_url, url)
pdf.get_links()
pdf.download_pdf()

AttributeError: 'NoneType' object has no attribute 'tbody' - Spyder 3.3.1 / beautifulsoup4 / python 3.6

Hey this is my setup: Spyder 3.3.1 / beautifulsoup4 / python 3.6
The below code is from an article on medium (here) about webscraping with python and Beautifulsoup. Was supposed to be a quick read but now TWO days later I still cant not get the code to run in spyder and keep getting:
File "/Users/xxxxxxx/Documents/testdir/swiftScrape.py", line 9, in table_to_df
return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
AttributeError: 'NoneType' object has no attribute 'tbody'
Not sure what is going wrong and seems to be an implementation error. Can anyone assist in sheding some light on this issue.
Thanks in advance.
import os
import bs4
import requests
import pandas as pd
PATH = os.path.join("C:\\","Users","xxxxx","Documents","tesdir")
def table_to_df(table):
return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
def next_page(soup):
return "http:" + soup.find('a', attrs={'rel':'next'}).get('href')
res = pd.DataFrame()
url = "http://bank-code.net/country/FRANCE-%28FR%29/"
counter = 0
while True:
print(counter)
page = requests.get(url)
soup = bs4.BeautifulSoup(page.content, 'lxml')
table = soup.find(name='table', attrs={'id':'tableID'})
res = res.append(table_to_df(table))
res.to_csv(os.path.join(PATH,"BIC","table.csv"), index=None, sep=';', encoding='iso-8859-1')
url = next_page(soup)
counter += 1
Like a lost of example code found on the web, this code is not production-grade code - it blindly assumes that http requests always succeed and returns the expected content. The truth is that it's quite often not the case (network errors, proxies or firewall that blocks you, site down - temporarily or definitely, updates in the site that changed either the urls and/or the page's markup etc).
Your problem manifests itself here:
def table_to_df(table):
return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
and comes from table actually being None, which means that here in the for loop:
table = soup.find(name='table', attrs={'id':'tableID'})
there was no "table" tag with id "tableID" found in the html document. You can check this by printing the actual html content:
while True:
print(counter)
page = requests.get(url)
soup = bs4.BeautifulSoup(page.content, 'lxml')
table = soup.find(name='table', attrs={'id':'tableID'})
if table is None:
print("no table 'tableID' found for url {}".format(url))
print("html content:\n{}\n".format( page.content))
continye
# etc
Thanks #bruno desthuilliers for your pointers. Much appreciated.
This is the rewritten code that worked for me using Selenium and webdriver rather than import requests:
import os
import bs4
import pandas as pd
from selenium import webdriver
PATH = os.path.join('/','Users','benmorris','documents','testdir')
def table_to_df(table):
return pd.DataFrame([[td.text for td in row.find_all('td')] for row in soup.find_all('tr')])
def next_page(soup):
return "http:" + soup.find('a', attrs={'rel':'next'}).get('href')
res = pd.DataFrame()
url = "http://bank-code.net/country/FRANCE-%28FR%29/"
counter = 0
driver = webdriver.Chrome()
driver.get(url)
while True:
print(counter)
page = driver.get(url)
soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
table = driver.find_element_by_xpath('//*[#id="tableID"]')
if table is None:
print("no table 'tableID' found for url {}".format(url))
print("html content:\n{}\n".format( page.content))
continue
res = res.append(table_to_df(table))
res.to_csv(os.path.join(PATH,"BIC","table.csv"), index=False, sep=',', encoding='iso-8859-1')
url = next_page(soup)
counter += 1

how to speed up my process

I wrote a script that will web scrape data for a list of stocks. The scraper has to get the data from 2 separate pages so each stock symbol must scrape 2 different pages. If I run the process on a list that is 1000 items long it will take around 30 minutes to complete. It's not horrible, I can set it and forget it, but I'm wondering if there is a way to speed up the process. Maybe store the data and wait to write it all at the end instead of on each loop? Any other ideas appreciated.
import requests
from BeautifulSoup import BeautifulSoup
from progressbar import ProgressBar
import csv
symbols = {'AMBTQ','AABA','AAOI','AAPL','AAWC','ABEC','ABQQ','ACFN','ACIA','ACIW','ACLS'}
pbar = ProgressBar()
with open('industrials.csv', "ab") as csv_file:
writer = csv.writer(csv_file, delimiter=',')
writer.writerow(['Symbol','5 Yr EPS','EPS TTM'])
for s in pbar(symbols):
try:
url1 = 'https://research.tdameritrade.com/grid/public/research/stocks/fundamentals?symbol='
full1 = url1 + s
response1 = requests.get(full1)
html1 = response1.content
soup1 = BeautifulSoup(html1)
for hist_div in soup1.find("div", {"data-module-name": "HistoricGrowthAndShareDetailModule"}):
EPS5yr = hist_div.find('label').text
except Exception as e:
EPS5yr = 'Bad Data'
pass
try:
url2 = 'https://research.tdameritrade.com/grid/public/research/stocks/summary?symbol='
full2 = url2 + s
response2 = requests.get(full2)
html2 = response2.content
soup2 = BeautifulSoup(html2)
for div in soup2.find("div", {"data-module-name": "StockSummaryModule"}):
EPSttm = div.findAll("dd")[11].text
except Exception as e:
EPSttm = "Bad data"
pass
writer.writerow([s,EPS5yr,EPSttm])

Scrape page with generator

I scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out if what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time
tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')
tld = "uz"
has_next = True
page = 0
def create_link(tld, page):
if page == 0:
link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain"
else:
link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain/page/" + repr(page)
return link
def check_for_next(soup):
disabled_nav = soup.find(class_="pagingDivDisabled")
if disabled_nav:
if "Next" in disabled_nav:
return False
else:
return True
else:
return True
def make_soup(link):
html = jw.get_page(link)
soup = BeautifulSoup(html, "lxml")
return soup
def all_the_pages(counter):
while True:
link = create_link(tld, counter)
soup = make_soup(link)
if check_for_next(soup) == True:
yield counter
else:
break
counter += 1
def scrape_page(soup):
table = soup.find('table', {'class': 'rankTable'})
th = table.find('tbody')
test = th.find_all("td")
correct_cells = range(1,len(test),3)
for cell in correct_cells:
#print test[cell]
url = repr(test[cell])
content = re.sub("<[^>]*>", "", url)
sites.writerow([tld]+[content])
def main():
for page in all_the_pages(0):
print page
link = create_link(tld, page)
print link
soup = make_soup(link)
scrape_page(soup)
main()
My thinking behind the code:
The scraper should get the page, determine if there is another page that follows, scrape the current page and move to the next one, repreating the process. If there is no next page, it should stop. Does that make sense how I'm going it here?
As I told you, you could use selenium for programmatically clicking on the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup
def page_count():
pages = 1
url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
while True:
html = requests.get(url.format(pages)).content
soup = BeautifulSoup(html)
table = soup.find('table', {'class': 'rankTable'})
if len(table.find_all('tr')) <= 1:
return pages
pages += 1

Categories

Resources