Python Jupyter - Crawling eBay website , nested try & except ERROR - python

I'm trying to get the PRICE from 1000 pages on eBay, but there are 3 different ID tags for the price. I've tried a few options, but I either get an "IndentationError", or when I get the output.csv, there is a column of empty cells.
def get_detail_data(soup):
    """Extract (currency, price) from an eBay item page.

    The price <span> can carry three different ids depending on the listing
    type, so each lookup is tried in turn.  The original code performed the
    split inside the outermost ``except`` (so it only ran after a failure,
    on a possibly-unbound ``p``) and never returned the result; both are
    fixed here.  On any failure both fields are empty strings.
    """
    try:
        try:
            try:
                # Fixed-price listings.
                p = soup.find('span', id='prcIsum').text.strip()
            except AttributeError:
                # Auction listings expose the current bid instead.
                p = soup.find('span', id='prcIsum_bidPrice').text.strip()
        except AttributeError:
            # Discounted / sale-price variant.
            p = soup.find('span', id='mm-saleDscPrc').text.strip()
        # Text looks like "US $12.34" -> split into the two elements.
        currency, price = p.split(' ')
    except Exception:
        currency = ''
        price = ''
    return currency, price

The indentation levels of the try and except need to be matched.
This should fix the indentation error, though I am not sure if it will make the code do what you want:
# Try the three possible price <span> ids in turn; the split only runs once
# a price string was actually found.  AttributeError is what .find(...) -> None
# produces, so we catch exactly that instead of a bare except.
try:
    try:
        try:
            p = soup.find('span', id='prcIsum').text.strip()
        except AttributeError:
            p = soup.find('span', id='prcIsum_bidPrice').text.strip()
    except AttributeError:
        p = soup.find('span', id='mm-saleDscPrc').text.strip()
    currency, price = p.split(' ')  # e.g. "US $12.34" -> ('US', '$12.34')
except Exception:
    currency = ''
    price = ''

Related

Python web scraping : how to skip url error

I am trying to scrape a webpage ("coinmarketcap"). I am scraping data from 2013 to 2019 October (Open, High, Low, Close, Marketcap, Volume) of all cryptocurrencies.
# Fetch each currency's historical-data page and locate the price table.
# NOTE(review): assumes `name` is a list of URL slugs and name_size == len(name)
# — both are defined outside this snippet; confirm against the caller.
for j in range(0, name_size):
    url = ("https://coinmarketcap.com/currencies/" + str(name[j])
           + "/historical-data/?start=20130429&end=20191016")
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, 'html.parser')
    priceDiv = soup.find('div', attrs={'class': 'table-responsive'})
    rows = priceDiv.find_all('tr')
The problem is some url doesn't exist. And I don't know how to skip those. Can you please help me?
Use try-except
for j in range(0, name_size):
    url = ("https://coinmarketcap.com/currencies/" + str(name[j])
           + "/historical-data/?start=20130429&end=20191016")
    try:
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, 'html.parser')
        priceDiv = soup.find('div', attrs={'class': 'table-responsive'})
    except Exception:
        # BUG FIX: without `continue`, execution fell through to the line
        # below with `priceDiv` undefined (or stale from a previous loop).
        print("Could not open url: " + url)
        continue
    rows = priceDiv.find_all('tr')
use error catching.
# Illustrative pattern: wrap the risky work and report failures.
try:
    do_the_thing()  # the operation that may raise
except Exception as e:
    print(e)  # here you can print the error
The ones in error will simply be skipped with the print message; otherwise the task continues.

How to exclude stocks based on technicals on python

I have a code that gives me the technicals of stocks from yahoo, no problem with that, but I am trying to get the program to not print stocks if they do not meet requirements, for example, if revenue is not greater than 100B.
I have tried an if statement at various parts of this code, none seem to work.
def scrape_yahoo(stock):
    """Scrape the Yahoo key-statistics table for *stock* into a dict.

    NOTE(review): the ``def`` line was lost in the original paste; the name
    and parameter are reconstructed from the call site ``scrape_yahoo(each_stock)``
    further down.  Returns a dict mapping row name -> value string; on any
    failure the exception is printed and None is returned (original behavior).
    """
    technicals = {}
    try:
        url = ('http://finance.yahoo.com/q/ks?s=' + stock)
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page, 'html.parser')
        # Class found using page inspection.
        tables = soup.findAll('table', {"class": 'table-qsp-stats'})
        for table in tables:
            table_body = table.find('tbody')
            rows = table_body.find_all('tr')
            for row in rows:
                col_name = row.find_all('span')  # use span to avoid subscripts
                col_name = [cell.text.strip() for cell in col_name]
                col_val = row.find_all('td')
                col_val = [cell.text.strip() for cell in col_val]
                # col_val[0] is the name cell (with subscript)
                technicals[col_name[0]] = col_val[1]
        return technicals
    except Exception as e:
        print('Failed, exception: ', str(e))
def scrape(stock_list, interested, technicals):
    """Print the interesting stats for each stock whose revenue exceeds $100B.

    Returns the technicals dict of the last stock processed (original behavior).
    """
    for each_stock in stock_list:
        technicals = scrape_yahoo(each_stock)
        # BUG FIX: int('Revenue') tried to parse the literal string 'Revenue'
        # (the ValueError in the question).  Look the value up in the scraped
        # dict instead; Yahoo formats it like "260.17B", so strip the suffix
        # and scale to dollars.
        revenue_text = str(technicals.get('Revenue', '0')).replace('B', '')
        try:
            revenue = float(revenue_text) * 1000000000
        except ValueError:
            revenue = 0.0  # e.g. 'N/A' — treat as failing the threshold
        if revenue > 100000000000:
            print(each_stock)
            for ind in interested:
                print(ind + ": " + technicals[ind])
            print("------")
        time.sleep(1)  # delay to avoid getting flagged as a bot
    return technicals
def main():
    """Scrape a fixed watch list and print the collected technicals."""
    stock_list = ['aapl', 'tsla', 'ge']
    interested = ['Market Cap (intraday)', 'Return on Equity', 'Revenue', 'Quarterly Revenue Growth']
    technicals = {}
    tech = scrape(stock_list, interested, technicals)
    print(tech)


main()
ValueError: invalid literal for int() with base 10: 'Revenue'
I assume that the technicals variable is a dict and that it has a 'Revenue' key.
You should change
if int('Revenue')
to
if int(technicals.get('Revenue', 0))
import time
import urllib.request
from bs4 import BeautifulSoup
def scrape_yahoo(stock):
    """Fetch and parse the Yahoo key-statistics page for *stock*.

    Returns a dict of statistic name -> value string.  On any failure the
    exception is printed and an *empty dict* is returned so that callers can
    still call ``.get()`` on the result (the original returned None here,
    which crashed the caller).
    """
    technicals = {}
    try:
        # BUG FIX: the original URL contained a stray space: '?s= ' + stock.
        url = ('http://finance.yahoo.com/q/ks?s=' + stock)
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, 'html.parser')
        # Class found using page inspection.
        tables = soup.findAll('table', {"class": 'table-qsp-stats'})
        for table in tables:
            table_body = table.find('tbody')
            rows = table_body.find_all('tr')
            for row in rows:
                col_name = row.find_all('span')  # use span to avoid subscripts
                col_name = [cell.text.strip() for cell in col_name]
                col_val = row.find_all('td')
                col_val = [cell.text.strip() for cell in col_val]
                # col_val[0] is the name cell (with subscript)
                technicals[col_name[0]] = col_val[1]
        return technicals
    except Exception as e:
        print('Failed, exception: ', str(e))
        return technicals  # empty dict rather than None, so .get() is safe
def scrape(stock_list, interested, technicals):
    """Print stats for every stock whose revenue is above $100B.

    Returns the technicals dict of the last stock processed (original behavior).
    """
    for each_stock in stock_list:
        technicals = scrape_yahoo(each_stock)
        # BUG FIX: the default must be a *string* — the original used 0 (an
        # int), which has no .replace() and raised AttributeError whenever
        # the 'Revenue' key was missing.
        revenue_text = str(technicals.get('Revenue', '0')).replace("B", "")
        try:
            revenue = float(revenue_text) * 1000000000
        except ValueError:
            revenue = 0.0  # e.g. 'N/A' — treat as failing the threshold
        if revenue > 100000000000:
            print(each_stock)
            for ind in interested:
                print(ind + ": " + technicals[ind])
            print("------")
        time.sleep(1)  # delay to avoid getting flagged as a bot
    return technicals
def main():
    """Scrape a fixed watch list and print the collected technicals."""
    stock_list = ['aapl', 'tsla', 'ge']
    interested = ['Market Cap (intraday)', 'Return on Equity', 'Revenue', 'Quarterly Revenue Growth']
    technicals = {}
    tech = scrape(stock_list, interested, technicals)
    print(tech)


main()

im getting connection errors in my python program

I have tried to run my program, but each time I get an error in the middle of the run.
basiclly, my program does this :
1. get the xml from my website
2. run all the urls
3. get data from my web page (sku,name,title, price etc)
4. get the lowest price from another website, by comparing prices for items with the same SKU
The problem is that I have more than 7,000 URLs in my XML, so my program gets a network error each time.
What should I do? How can I resolve it?
def parse_sitemap(url):
    """Collect the <loc> URL of every entry in the sitemap at *url*.

    NOTE(review): the original snippet was abridged by the author — the
    request target was a placeholder literal (``XXXX``) and the code that
    turned ``resp`` into ``urls`` was omitted.  ``out`` was also never
    initialised (NameError on first append); that is fixed here.
    """
    out = []  # was missing in the original
    resp = requests.get(url)  # original passed a placeholder instead of url
    # TODO(review): confirm the tag name against the real sitemap schema.
    urls = BeautifulSoup(resp.content, 'xml').find_all('url')
    for u in urls:
        loc = u.find('loc').string
        # not a sitemap requirement; skip if not present
        out.append([loc])
    return out
def get_sku(u):
    """Return the itemprop=sku text of the product page at *u*.

    Mirrors the error handling of the sibling getters (get_price etc.):
    any network or parse failure yields a sentinel string instead of
    crashing the whole 7,000-URL crawl.
    """
    try:
        html = requests.get(u)
        bsObj = BeautifulSoup(html.content, 'xml')
        sku = bsObj.find('span', attrs={'itemprop': 'sku'}).get_text()
        return sku
    except Exception:
        return 'no sku'
def get_price(u):
    """Return the itemprop=price text of the page at *u*, or 'no price'.

    The trailing shekel sign the site appends is stripped from the result.
    """
    try:
        html = requests.get(u)
        bsObj = BeautifulSoup(html.content, 'xml')
        price = bsObj.find('span', attrs={'itemprop': 'price'}).get_text()
        price = str(price).replace(' ₪‎', '')
        return price
    except Exception:  # narrowed from a bare except
        return 'no price'
def get_zapPrice(makat):
    """Search zap.co.il for SKU *makat* and return the listed price text."""
    try:
        check = 'https://www.zap.co.il/search.aspx?keyword=' + makat
        # One request is enough: requests follows redirects itself, so the
        # original's second requests.get(r.url) just doubled the traffic.
        r = requests.get(check)
        bsObj = BeautifulSoup(r.content, 'html.parser')
        zapPrice = bsObj.select_one('div.StoresLines div.PriceNum').text.strip().replace(' ₪', '')
        return zapPrice
    except Exception:  # narrowed from a bare except
        return 'no zap product'
def get_zapStoreName(makat):
    """Search zap.co.il for SKU *makat* and return the selling store's name."""
    try:
        check = 'https://www.zap.co.il/search.aspx?keyword=' + makat
        # requests follows redirects, so re-fetching r.url is unnecessary.
        r = requests.get(check)
        bsObj = BeautifulSoup(r.content, 'html.parser')
        # BUG FIX: the CSS selector string was broken across two source
        # lines, which is a SyntaxError inside a plain string literal.
        storeName = bsObj.select_one('div.StoresLines div.BuyButtonsTxt').text.strip().replace('ב-', '')
        return storeName
    except Exception:  # narrowed from a bare except
        return 'no zap product'
# Write one spreadsheet row per product URL.  enumerate() supplies the
# counter that the original printed as an undefined name `i` (NameError).
for i, u in enumerate(urls, start=1):
    ws1['A1'] = u
    makat = get_sku(u)
    ws1['F1'] = makat
    zapPrice = get_zapPrice(makat)
    ws1['I1'] = zapPrice
    storeName = get_zapStoreName(makat)
    ws1['J1'] = storeName
    # Push the finished row down and start a fresh row 1.
    ws1.insert_rows(1)
    ws1.append([])
    print("writing product no." + str(i))

# Header row is written last because data rows are inserted at the top.
ws1['F1'] = 'makat'
ws1['I1'] = 'zap price'
ws1['J1'] = 'zap store'
wb.save("sample.xlsx")
wb.close()
print('end')
I didn't write all of my code — but the essentials are here.
Each function starts with requests.get, gets what I want, and returns it.
After that, I write it to an Excel file.
The problem is that I get the error after about 1,000 URL checks...
What is the problem?

Python How to retrieve a stock's last current stock price from the dictionary and put it into a variable?

I am trying to obtain a stock's current price, and then put it into a variable to run if / else statements on. I have used the Google API to retrieve current stock prices, but I am unable to figure out how to put it into a variable. Thanks!
import json
import sys
try:
from urllib.request import Request, urlopen
except ImportError: #python 2
from urllib2 import Request, urlopen
# Maps Google Finance's terse JSON keys to human-readable field names.
googleFinanceKeyToFullName = {
    u'id':     u'ID',
    u't':      u'StockSymbol',
    u'e':      u'Index',
    u'l':      u'LastTradePrice',
    u'l_cur':  u'LastTradeWithCurrency',
    u'ltt':    u'LastTradeTime',
    u'lt_dts': u'LastTradeDateTime',
    u'lt':     u'LastTradeDateTimeLong',
    u'div':    u'Dividend',
    u'yld':    u'Yield'
}
def buildUrl(symbols):
    """Build the quote-request URL for a list of ticker symbols.

    Uses a deprecated but still active & correct Google Finance API.
    """
    # ','.join(symbols) — the identity comprehension added nothing.
    symbol_list = ','.join(symbols)
    return 'http://finance.google.com/finance/info?client=ig&q=' + symbol_list
def request(symbols):
    """Fetch the raw quote payload for *symbols* and return it as text.

    The feed prefixes its JSON with three junk characters, which are
    stripped before returning.
    """
    url = buildUrl(symbols)
    req = Request(url)
    resp = urlopen(req)
    # Remove special symbols such as the pound symbol.
    content = resp.read().decode('ascii', 'ignore').strip()
    content = content[3:]  # drop the "// " prefix the feed emits
    return content
def replaceKeys(quotes):
    """Return *quotes* with terse Google keys replaced by readable names.

    Keys with no entry in googleFinanceKeyToFullName are dropped, matching
    the original behavior.
    """
    # NOTE: `global` was unnecessary — the mapping is only read, never rebound.
    quotesWithReadableKey = []
    for q in quotes:
        qReadableKey = {googleFinanceKeyToFullName[k]: q[k]
                        for k in googleFinanceKeyToFullName if k in q}
        quotesWithReadableKey.append(qReadableKey)
    return quotesWithReadableKey
def getQuotes(symbols):
    """Return readable quote dicts for a single symbol or a list of symbols."""
    if isinstance(symbols, str):  # was: type(symbols) == type('str')
        symbols = [symbols]
    content = json.loads(request(symbols))
    return replaceKeys(content)
if __name__ == '__main__':
    # Symbols come from argv[1] as a comma-separated list, with a default
    # watch list when none is given.
    try:
        symbols = sys.argv[1]
    except IndexError:  # narrowed from a bare except
        symbols = "GOOG,AAPL,MSFT,AMZN,SBUX"
    symbols = symbols.split(',')
    try:
        print(json.dumps(getQuotes(symbols), indent=2))
    except Exception:  # narrowed from a bare except
        print("Fail")
You can get the last current stock price from the dictionary and put it into a variable, say price,
by changing the last part of the code to
try:
    quotes = getQuotes(symbols)
    price = quotes[-1]['LastTradePrice']  # -1 means last in a list
    print(price)
except Exception as e:
    print(e)
but it is very unreliable because if the order of prices is changed, you will get a price for a different stock.
What you should do is learn how to define a data structure that's suitable to solve your problem.

Integers, Float and Beautiful Soup Attribute

I'm trying to get my all_data tags into a format where I can do a comparison with them with a Boolean. I think it involves using the float and/or int operator. However, I have some concerns about the output once the site is scraped. The output is in integers, decimals, and percentages. The specific line I am talking about modifying is line 33. I have tried using int() and .int. I haven't found any questions on Stack Overflow about this or anything in the Beautiful Soup documentation.
from BeautifulSoup import BeautifulSoup
import csv
import re
import urllib
import urllib2
from urllib2 import HTTPError
# import modules
symbolfile = open("symbols.txt")
symbolslist = symbolfile.read()
newsymbolslist = symbolslist.split("\n")

i = 0
f = csv.writer(open("pe_ratio.csv", "wb"))  # short cut to write
# first write row statement
f.writerow(["Name", "PE", "Revenue % Quarterly", "ROA% YOY",
            "Operating Cashflow", "Debt to Equity"])

# define name_company as the following
while i < len(newsymbolslist):
    try:
        page = urllib2.urlopen(
            "http://finance.yahoo.com/q/ks?s=" + newsymbolslist[i]
            + "%20Key%20Statistics").read()
    except urllib2.HTTPError:
        # BUG FIX: `continue` without advancing i re-requested the same
        # failing symbol forever — an infinite loop.
        i += 1
        continue
    soup = BeautifulSoup(page)
    name_company = soup.findAll("div", {"class": "title"})
    for name in name_company:  # add multiple iterations?
        all_data = soup.findAll('td', "yfnc_tabledata1")
        stock_name = name.find('h2').string  # company's name from its h2 tag
        try:
            f.writerow([stock_name, all_data[2].getText(), all_data[17].getText(),
                        all_data[13].getText(), all_data[29].getText(),
                        all_data[26].getText()])  # write down PE data
        except (IndexError, HTTPError) as e:
            pass
    i += 1
This is what the output looks like in the CSV file.
Agilent Technologies Inc. (A) 25.7 -2.80% 5.60% N/A 51.03
Please keep in mind you load the stock ticker symbols by putting them in vertically in the symbols.txt file.
To convert your all_data string values to numbers try something like this:
all_data = soup.findAll('td', "yfnc_tabledata1")
stock_name = name.find('h2').string #find company's name in name_company with h2 tag
clean_data = list()
for x in [data.GetText().strip(' %') for data in all_data]
try:
clean_data.append(float(x))
except ValueError:
clean_data.append(x)
try:
f.writerow([stock_name, clean_data[2], clean_data[17], clean_data[13], clean_data[29], clean_data[26]]) #write down PE data
except (IndexError, HTTPError) as e:
pass
If you want to run comparisons on the data (i.e. is quarterly percent greater than 25) you'll have to format the text so it can be converted to a number
quarterly_percent = all_data[17].getText()
if quarterly_percent != "N/A":
    # Cut off the percent sign and convert to a Python float.
    quarterly_percent = float(quarterly_percent[:-1])
    if quarterly_percent > 25:
        print("its a good one")

Categories

Resources