Integers, Floats and Beautiful Soup Attributes - Python

I'm trying to get my all_data tags into a format where I can compare them with a Boolean. I think it involves using float() and/or int(). However, I have some concerns about the output once the site is scraped: the values come back as a mix of integers, decimals, and percentages. The specific line I am talking about modifying is line 33. I have tried using int() and .int. I haven't found any questions about this on Stack Overflow or in the Beautiful Soup documentation.
from BeautifulSoup import BeautifulSoup
import csv
import re
import urllib
import urllib2
from urllib2 import HTTPError
# import modules

symbolfile = open("symbols.txt")
symbolslist = symbolfile.read()
newsymbolslist = symbolslist.split("\n")

i = 0
f = csv.writer(open("pe_ratio.csv", "wb"))
# short cut to write
f.writerow(["Name", "PE", "Revenue % Quarterly", "ROA% YOY", "Operating Cashflow", "Debt to Equity"])
# first write row statement

# define name_company as the following
while i < len(newsymbolslist):
    try:
        page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s=" + newsymbolslist[i] + "%20Key%20Statistics").read()
    except urllib2.HTTPError:
        continue
    soup = BeautifulSoup(page)
    name_company = soup.findAll("div", {"class": "title"})
    for name in name_company:  # add multiple iterations?
        all_data = soup.findAll('td', "yfnc_tabledata1")
        stock_name = name.find('h2').string  # find company's name in name_company with h2 tag
        try:
            f.writerow([stock_name, all_data[2].getText(), all_data[17].getText(), all_data[13].getText(), all_data[29].getText(), all_data[26].getText()])  # write down PE data
        except (IndexError, HTTPError) as e:
            pass
    i += 1
This is what the output looks like in the CSV file.
Agilent Technologies Inc. (A) 25.7 -2.80% 5.60% N/A 51.03
Please keep in mind that the script loads the stock ticker symbols from symbols.txt, one symbol per line.
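For reference, a symbols.txt might look like this (these tickers are just placeholders):

A
MSFT
GOOG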

To convert your all_data string values to numbers, try something like this:

all_data = soup.findAll('td', "yfnc_tabledata1")
stock_name = name.find('h2').string  # find company's name in name_company with h2 tag
clean_data = list()
for x in [data.getText().strip(' %') for data in all_data]:
    try:
        clean_data.append(float(x))
    except ValueError:
        clean_data.append(x)
try:
    f.writerow([stock_name, clean_data[2], clean_data[17], clean_data[13], clean_data[29], clean_data[26]])  # write down PE data
except (IndexError, HTTPError) as e:
    pass
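Since clean_data now holds floats where conversion succeeded (and the original strings such as "N/A" elsewhere), the Boolean comparison you're after works directly. A minimal sketch, assuming index 2 is the P/E column as in your writerow call:

pe = clean_data[2]
if isinstance(pe, float) and pe < 20:  # "N/A" cells stay strings, so guard first
    print "P/E under 20"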

If you want to run comparisons on the data (e.g., is the quarterly percentage greater than 25?), you'll have to format the text so it can be converted to a number:

quarterly_percent = all_data[17].getText()
if quarterly_percent != "N/A":
    # cut off the percent sign and convert to a "python number"
    quarterly_percent = float(quarterly_percent[:-1])
    if quarterly_percent > 25:
        print "its a good one"

Related

How do I get the code to read a different row besides 1?

I'm trying to take data from a site and paste it into Excel (a suitable super-noob project!).
I'm able to get the first row to write properly to Excel, BUT the sheet just repeats the first row (Case Keenum's data), and the terminal prints Case Keenum's data 250 times.
NO error message.
I tried inserting a "return True" after the append, but it said it needed to be part of a function. I looked in many places, but my scenario seems unusual, and I suspect it has to do with my lack of knowledge of the science.
Here's my code thus far:
from bs4 import BeautifulSoup
import requests
import openpyxl

excel = openpyxl.Workbook()
print(excel.sheetnames)
sheet = excel.active
sheet.title = 'HOF'
print(excel.sheetnames)
sheet.append(['name', 'yards', 'year1', 'year2'])

try:
    source = requests.get('https://www.sports-reference.com/cfb/leaders/pass-yds-player-career.html')
    source.raise_for_status()
    soup = any = BeautifulSoup(source.text, 'html.parser')
    quarterbacks = soup.find('tbody').find_all('tr')
    print(len(quarterbacks))
    for quarterback in quarterbacks:
        name = soup.find('td', class_="left").a.text
        yards = soup.find('td', class_="right").get_text(strip=True)
        year1 = soup.find('td', class_="center").get_text(strip=True)
        year2 = soup.find('td', class_="center").get_text(strip=True)
        print(name, yards, year1, year2)
        sheet.append([name, yards, year1, year2])
except Exception as e:
    print(e)

excel.save('qbdatascrape.xlsx')
I would let pandas do that work for you:

import pandas as pd

try:
    df = pd.read_html('https://www.sports-reference.com/cfb/leaders/pass-yds-player-career.html')[0]
    df = df.rename(columns={'Player': 'name',
                            'Yds': 'yards',
                            'From': 'year1',
                            'To': 'year2'})
    df = df[['name', 'yards', 'year1', 'year2']]
    df.to_excel('qbdatascrape.xlsx', sheet_name='HOF', index=False)
except Exception as e:
    print(e)
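As a side note, the repetition in your original loop comes from calling soup.find inside the loop: find always returns the first match in the whole document, so every iteration sees Case Keenum's row. Scoping the lookups to each row fixes that; a sketch, assuming the td classes from your code match the page and the two year values are the first two center-aligned cells:

for quarterback in quarterbacks:
    # search within the current row, not the whole soup
    name = quarterback.find('td', class_="left").a.text
    yards = quarterback.find('td', class_="right").get_text(strip=True)
    centers = quarterback.find_all('td', class_="center")
    year1 = centers[0].get_text(strip=True)
    year2 = centers[1].get_text(strip=True)
    sheet.append([name, yards, year1, year2])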

Python Jupyter - Crawling eBay website, nested try & except ERROR

I'm trying to get the PRICE from 1000 pages on eBay, but there are 3 different ID tags for the price. I've tried a few options, but I either got an "IndentationError", or the output.csv I got had a column of empty cells.
def get_detail_data(soup):  # Price
    try:
        try:
            try:
                p = soup.find('span', id='prcIsum').text.strip()  ## get rid of the spaces, also split the string by space
            except:
                p = soup.find('span', id='prcIsum_bidPrice').text.strip()
        except:
            p = soup.find('span', id='mm-saleDscPrc').text.strip()
    except:
        currency, price = p.split(' ')  ## to get 2 elements
    except:
        currency = ''
        price = ''
The indentation levels of the try and except need to be matched.
This should fix the indentation error, though I am not sure if it will make the code do what you want:
try:
    try:
        try:
            p = soup.find('span', id='prcIsum').text.strip()  ## get rid of the spaces, also split the string by space
        except:
            p = soup.find('span', id='prcIsum_bidPrice').text.strip()
    except:
        p = soup.find('span', id='mm-saleDscPrc').text.strip()
    currency, price = p.split(' ')  ## to get 2 elements
except:
    currency = ''
    price = ''
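As an alternative to the nesting, you could loop over the candidate IDs and stop at the first one that matches. A sketch using the three span IDs from the question (untested against eBay's current markup; the sample price string is hypothetical):

def get_price(soup):
    # try each known price span in order; the first hit wins
    for span_id in ('prcIsum', 'prcIsum_bidPrice', 'mm-saleDscPrc'):
        tag = soup.find('span', id=span_id)
        if tag:
            currency, price = tag.text.strip().split(' ', 1)  # e.g. "US $12.50"
            return currency, price
    return '', ''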

For Loop only prints the first value

I am trying to web scrape stock data using a for loop over a list of five stocks. The problem is that only the first value is returned, five times. I have tried appending to a list, but it still doesn't work, although clearly I am not appending correctly. On the website, the Operating Cash figure comes in a form like 14B or 1B, which is why I remove the B and multiply to get a raw value. Here is my code:
import requests
import yfinance as yf
import pandas as pd
from bs4 import BeautifulSoup

headers = {'User Agent': 'Mozilla/5.0'}
stocks = ['AMC', 'AMD', 'PFE', 'AAPL', 'NVDA']
finished_list = []

for stock in stocks:
    url = f'https://www.marketwatch.com/investing/stock/{stock}/financials/cash-flow'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    operating_cash = soup.findAll('div', class_="cell__content")[134].text
    finished_list.append(operating_cash)
    if 'B' in operating_cash:
        cash1 = operating_cash.replace('B', '')
        if '(' in cash1:
            cash2 = cash1.replace('(', '-')
            if ')' in cash2:
                cash3 = cash2.replace(')', '')
                cash3 = float(cash3)
                print(cash3 * 1000000000)
        else:
            cash1 = float(cash1)
            print(cash1 * 1000000000)
The current output is -1060000000.0 five times in a row, which is the correct operating-cash value for AMC but not for the other four. Thanks in advance to anyone who can help me out.
You don't need if conditions for str.replace(): when the substring isn't present, replace() simply returns the string unchanged. Instead, do all your replacements in one line, like so:
for stock in stocks:
    url = f'https://www.marketwatch.com/investing/stock/{stock}/financials/cash-flow'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    operating_cash = soup.findAll('div', class_="cell__content")[134].text
    finished_list.append(operating_cash)
    cash = float(operating_cash.replace('B', '').replace('(', '-').replace(')', ''))
    print(cash * 1000000000)

Output:

-1060000000.0
1070000000.0000001
14400000000.0
80670000000.0
5820000000.0
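A quick illustration of the chained replacements, using made-up cell values in the site's format (parenthesized values are negatives):

for raw in ['14.4B', '(1.06B)']:  # hypothetical scraped values
    cleaned = float(raw.replace('B', '').replace('(', '-').replace(')', ''))
    print(cleaned * 1000000000)
# 14400000000.0
# -1060000000.0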

Fix the syntax error of a list comprehension that contains beautiful soup methods

I tried hard, but there is always some syntax error in the piece of code that follows.
import urllib.request
import re
import csv
from bs4 import BeautifulSoup
from bs4 import NavigableString
from unicodedata import normalize

url = input('Please paste the link here: ')
html = urllib.request.urlretrieve(url)
html_file = open(html[0])
soup = BeautifulSoup(html_file, 'html5lib')

def contains_href(tag):
    return tag.find('a', href=True)

scrollables = [table in soup.find_all('table', class_='sc_courselist') if contains_href(table)]

def num_name_unit(tag):
    td_num = tag.find('td', href=True)
    num = normalize('NFKD', td_num.string.strip())
    td_name = tag.find('td', class_=False)
    name = normalize('NFKD', td_name.string.strip())
    td_unit = tag.find('td', class_='hourscol')
    unit = normalize('NFKD', td_unit.string.strip())
    row = ['Course Number: {0} | Course Name: {1} | Course Unit: {2}'.format(num, name, unit)]
    return row

dic_rows = {scrollable.find_previous_siblings(re.compile('h'), class_=False, limit=1).string.strip(): list(num_name_unit(tr) for tr in scrollable.find_all('tr', contains_href)) for scrollable in scrollables}
I expect the terminal to print the prompt "Please paste the link here: ". In reality, it says "invalid syntax" at the end of scrollables = [table in soup.find_all('table', class_='sc_courselist') if contains_href(table)].
You are missing the for part in your list comprehension. It should be:

[table for table in soup.find_all('table', class_='sc_courselist') if contains_href(table)]
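For readability you can also spread the comprehension over several lines. Note that contains_href works as the filter because find returns a Tag (truthy) when a link is found and None (falsy) otherwise:

scrollables = [table
               for table in soup.find_all('table', class_='sc_courselist')
               if contains_href(table)]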

Fetching the first image from a website that belongs to the post

I've written a program that fetches the desired information from a blog or any page. The next thing I want to achieve is to retrieve the first image from that page that belongs to the respective post (just like Facebook does when a post is shared).
I was able to achieve this to some extent by fetching the first image with an alt tag (since many websites don't have alt tags on their logos, icons, etc., the first one with an alt should belong to the post). But this does not seem to work in some cases. Is there any other (better) way to achieve this?
I'm using Python 2.7.9 and BeautifulSoup 4.
import time

import feedparser
import requests
from bs4 import BeautifulSoup

d = feedparser.parse('http://rss.cnn.com/rss/edition.rss')

for entry in d.entries:
    try:
        if entry.title is not None:
            print entry.title
            print ""
    except Exception, e:
        print e
    try:
        if entry.link is not None:
            print entry.link
            print ""
    except Exception, e:
        print e
    try:
        if entry.published[5:16] is not None:
            print entry.published[5:16]
            print ""
    except Exception, e:
        print e
    try:
        if entry.category is not None:
            print entry.category
            print ""
    except Exception, e:
        print e
    try:
        if entry.get('summary', '') is not None:
            print entry.get('summary', '')
            print ""
    except Exception, e:
        print e
    time.sleep(5)
    r = requests.get(entry.link, headers={'User-Agent': 'Safari/534.55.3 '})
    soup = BeautifulSoup(r.text, 'html.parser')
    for img in soup.findAll('img'):
        if img.has_attr('alt'):
            if img['src'].endswith('.jpg') or img['src'].endswith('.png'):
                print img['src']
                break
It is probably more practical to take a look at the opengraph module:
https://pypi.python.org/pypi/opengraph/0.5
and adapt it the way you like. It will fetch the "first image" from the HTML code, or use og:image.
If you want to learn, you can also do it by looking at the source code; the module uses BeautifulSoup too.
I needed the following monkeypatch to activate scraping as a fallback:
import re
from bs4 import BeautifulSoup
from opengraph import OpenGraph

def parser(self, html):
    """
    """
    if not isinstance(html, BeautifulSoup):
        doc = BeautifulSoup(html, from_encoding='utf-8')
    else:
        doc = html
    ogs = doc.html.head.findAll(property=re.compile(r'^og'))
    for og in ogs:
        self[og[u'property'][3:]] = og[u'content']
    # Couldn't fetch all attrs from og tags, try scraping body
    if not self.is_valid() and self.scrape:
        for attr in self.required_attrs:
            if not hasattr(self, attr):
                try:
                    self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
                except AttributeError:
                    pass

OpenGraph.parser = parser
OpenGraph.scrape = True  # workaround for some subtle bug in opengraph
You may need to handle relative URLs in the image sources, but that is quite straightforward with urljoin from urlparse:
import opengraph
...
page = opengraph.OpenGraph(url=link, scrape=True)
...
if page.is_valid():
    ...
    image_url = page.get('image', None)
    ...
    if not image_url.startswith('http'):
        image_url = urljoin(page['_url'], page['image'])

(some checks are omitted from this fragment for brevity)
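Pieced together, the fetch-and-resolve flow might look like this (a sketch built only from the fragments above; opengraph's internals may differ between versions):

import opengraph
from urlparse import urljoin  # Python 2, matching the question

page = opengraph.OpenGraph(url=entry.link, scrape=True)
if page.is_valid():
    image_url = page.get('image', None)
    if image_url and not image_url.startswith('http'):
        # resolve a relative src against the page URL
        image_url = urljoin(page['_url'], page['image'])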
