I am crawling data from Yahoo Finance. I found this link:
https://query1.finance.yahoo.com/v7/finance/download/BVH?period1=923729900&period2=1618039708&interval=1d&events=history&includeAdjustedClose=true
import requests
import pandas as pd

def createLink(symbol, table):
    s = "https://query1.finance.yahoo.com/v7/finance/download/BVH?period1=923729900&period2=1618039708&interval=1d&events=history&includeAdjustedClose=true"
    return s.replace("BVH", symbol).replace("history", table)

def getData(symbol, table):
    URL = createLink(symbol, table)
    web = requests.get(URL)
    if web.status_code == 200:
        reader = pd.read_csv(URL)
    else:
        reader = pd.DataFrame({"Data": [], "Dividends": [], "Stock Splits": []})
    return reader

def history(symbol):
    history_close = getData(symbol, 'history')
    if history_close.empty:
        return history_close
    divend = getData(symbol, 'div')
    stock = getData(symbol, 'split')
    x = pd.merge(divend, stock, how="outer", on="Date")
    data = pd.merge(history_close, x, how="outer", on="Date")
    return data
df = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/symbolNYSE.xlsx")
count = 0
count_fail = 0
for i in range(0, len(df["Symbol"])):
    try:
        count += 1
        print(df["Symbol"][i], count)
        a = history(df["Symbol"][i])
        if not a.empty:
            a.to_excel("/content/drive/MyDrive/ColabNotebooks/GetCloseYahoo/" + df["Symbol"][i] + ".xlsx")
    except:
        count_fail += 1
        pass
print("success:", count)
print("fail:", count_fail)
I am using Python, requests, and pandas on Jupyter to crawl it.
The errors:
Error tokenizing data. C error: Expected 2 fields in line 3, saw 12
KeyError
At the start I can crawl about 100-200 companies. Then the program fails on some symbol. If I wait a minute and run it again, it works without an error.
What is the reason? Thank you so much.
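For what it's worth, here is a minimal sketch of a safer download step. It is my own guess that the intermittent failures are rate-limit/error pages being fed to read_csv; getDataSafe and its retry settings are hypothetical, and createLink is the function defined above:

import io
import time
import requests
import pandas as pd

def getDataSafe(symbol, table, retries=3, wait=60):
    # hypothetical variant of getData: parse the body that was already
    # downloaded instead of letting read_csv fetch the URL a second time,
    # and back off when the response is not parseable CSV
    URL = createLink(symbol, table)   # createLink as defined above
    for _ in range(retries):
        web = requests.get(URL)
        if web.status_code == 200:
            try:
                return pd.read_csv(io.StringIO(web.text))
            except pd.errors.ParserError:
                pass  # likely an error/rate-limit page instead of CSV
        time.sleep(wait)  # wait before retrying, as described above
    return pd.DataFrame({"Date": [], "Dividends": [], "Stock Splits": []})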
Related
I wrote a script to get historical data from the public Trades endpoint of the Kraken API; the code is as follows:
import pandas as pd
import json
import time
import urllib.request

def get_data(pair, since, until):
    global data
    global query
    global json_response
    global api_data
    data_columns = ["price", "volume", "time", "buy/sell", "market/limit", "miscellaneous"]
    data = pd.DataFrame(columns=data_columns)
    api_start = since
    app_start_time = time.time()
    counter = 1
    while api_start < until:
        last_time = time.time()
        api_domain = "https://api.kraken.com/0/public/Trades" + \
            "?pair=%(pair)s&since=%(since)s" % {"pair": pair, "since": api_start}
        api_request = urllib.request.Request(api_domain)
        try:
            api_data = urllib.request.urlopen(api_request).read()
        except Exception:
            time.sleep(3)
        api_data = json.loads(api_data)
        if len(api_data["error"]) != 0:
            print(api_data["error"])
            time.sleep(3)
            continue
        query = pd.DataFrame(api_data["result"][pair], columns=data_columns)
        data = data.append(query, ignore_index=True)
        api_start = int(api_data["result"]["last"][:10])
        counter += 1
        time.sleep(1)
        print("Request number: %s" % counter)
        print("Time since start: %s minutes" % round((time.time() - app_start_time) / 60, 2))
        print("Time since last request: %s seconds" % round((time.time() - last_time), 2))
        print("last: %s" % api_start)
        print("")

get_data("XXBTZUSD", 1414761200, 1455761200)
After some successful responses, I get flawed responses: at some point, the UNIX timestamp simply jumps from 142894080.33775 to 1654992002.801943, resulting in wrong data.
Is that a problem with my code or with the API?
Thanks in advance.
Taking the liberty to simplify your code, I cannot confirm your observation; I get proper timestamps.
Try this:
import json
import requests
import pandas as pd

def get_data(pair, since):
    url = f"https://api.kraken.com/0/public/Trades?pair={pair}&since={since}"
    api_data = requests.get(url)
    api_data = json.loads(api_data.content)
    return api_data

results = get_data("XBTUSD", 1414761200)
columns = ["price", "volume", "time", "buy/sell", "market/limit", "miscellaneous"]
df = pd.DataFrame(results["result"]["XXBTZUSD"], columns=columns)
df.time = df.time.astype(int)
df.head()
Print out:
price volume time buy/sell market/limit miscellaneous
0 340.09209 0.02722956 1414815678 s m
1 340.15346 0.21604000 1414820304 s m
2 340.00000 0.03395999 1414820304 s m
3 340.00001 0.01000000 1414821818 s l
4 340.00000 0.25668009 1414821818 s l
Edit:
Using pagination I can confirm the jump in timestamps. The problem very likely lies with the API.
import json
import time
from datetime import datetime

import requests
import pandas as pd

def get_data(pair, since):
    url = f"https://api.kraken.com/0/public/Trades?pair={pair}&since={since}"
    api_data = requests.get(url)
    api_data = json.loads(api_data.content)
    return api_data

start_ts = 1414761200
frames = []
for _ in range(30):
    print(start_ts)
    print(datetime.fromtimestamp(int(start_ts)))
    tmp = get_data("XBTUSD", start_ts)
    start_ts = tmp["result"]["last"][:10]
    frames.append(pd.DataFrame(tmp["result"]["XXBTZUSD"]))
    time.sleep(2)
Print out after a couple of iterations:
1438313128
2015-07-31 05:25:28
1653648031
2022-05-27 12:40:31
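If you still want to paginate through the whole window despite that, a small guard can at least detect the jump. A sketch of my own workaround, reusing the get_data defined above and the 1455761200 end timestamp from the original question:

import time
import pandas as pd
from datetime import datetime

until = 1455761200          # end of the window from the original question
start_ts = 1414761200
frames = []
while int(start_ts) < until:
    tmp = get_data("XBTUSD", start_ts)      # get_data as defined above
    next_ts = int(tmp["result"]["last"][:10])
    if next_ts > until or next_ts > time.time():
        # the returned "last" timestamp left the requested window,
        # i.e. the jump described above; stop instead of collecting
        # trades from the wrong period
        print("timestamp jump detected at", datetime.fromtimestamp(next_ts))
        break
    frames.append(pd.DataFrame(tmp["result"]["XXBTZUSD"]))
    start_ts = next_ts
    time.sleep(2)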
I'm using a Medium API to get some information, but after some API calls the Python script ends with this error:
IndexError: list index out of range
Here is my Python code:
def get_post_responses(posts):
    #start = time.time()
    count = 0
    print('Retrieving the post responses...')
    responses = []
    for post in posts:
        url = MEDIUM + '/_/api/posts/' + post + '/responses'
        count = count + 1
        print("number of times api called", count)
        response = requests.get(url)
        response_dict = clean_json_response(response)
        responses += response_dict['payload']['value']
    #end = time.time()
    #four = end - start
    #global time_cal
    #time_cal.append(four)
    return responses

def check_if_high_recommends(response, recommend_min):
    if response['virtuals']['recommends'] >= recommend_min:
        return True

def check_if_recent(response):
    limit_date = datetime.now() - timedelta(days=360)
    creation_epoch_time = response['createdAt'] / 1000
    creation_date = datetime.fromtimestamp(creation_epoch_time)
    if creation_date >= limit_date:
        return True
It needs to work for more than 10,000 followers for a user.
I found an answer to my question: I just needed to wrap that line in a try/except.
response_dict = clean_json_response(response)
try:
    responses += response_dict['payload']['value']
except:
    continue
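Put back into the loop from the question, the idea looks roughly like this. This is a sketch only: MEDIUM, clean_json_response, and posts come from the original code, and the skipped counter is my own addition:

responses = []
skipped = 0
for post in posts:
    url = MEDIUM + '/_/api/posts/' + post + '/responses'
    response = requests.get(url)
    response_dict = clean_json_response(response)
    try:
        responses += response_dict['payload']['value']
    except (KeyError, IndexError):
        # some responses come back without a payload/value entry; skip them
        skipped += 1
        continue
print('skipped posts:', skipped)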
I have tried to run my program, but each time I get an error in the middle of the run.
Basically, my program does this:
1. get the XML from my website
2. loop over all the URLs
3. get data from my web page (SKU, name, title, price, etc.)
4. get the lowest price from another website, by comparing prices for the same SKU
The problem is that I have more than 7,000 URLs in my XML, so my program hits a network error every time.
What should I do? How can I resolve it?
def parse_sitemap(url):
    resp = requests.get(XXXX)
    for u in urls:
        loc = u.find('loc').string
        # not a sitemap requirement, skip if not present
        out.append([loc])
    return out

def get_sku(u):
    html = requests.get(u)
    bsObj = BeautifulSoup(html.content, 'xml')
    sku = bsObj.find('span', attrs={'itemprop': 'sku'}).get_text()
    return sku

def get_price(u):
    try:
        html = requests.get(u)
        bsObj = BeautifulSoup(html.content, 'xml')
        price = bsObj.find('span', attrs={'itemprop': 'price'}).get_text()
        price = str(price).replace(' ₪', '')
        return price
    except:
        return 'no price'

def get_zapPrice(makat):
    try:
        check = 'https://www.zap.co.il/search.aspx?keyword=' + makat
        r = requests.get(check)
        html = requests.get(r.url)
        bsObj = BeautifulSoup(html.content, 'html.parser')
        zapPrice = bsObj.select_one('div.StoresLines div.PriceNum').text.strip().replace(' ₪', '')
        return zapPrice
    except:
        return 'no zap product'

def get_zapStoreName(makat):
    try:
        check = 'https://www.zap.co.il/search.aspx?keyword=' + makat
        r = requests.get(check)
        html = requests.get(r.url)
        bsObj = BeautifulSoup(html.content, 'html.parser')
        storeName = bsObj.select_one('div.StoresLines div.BuyButtonsTxt').text.strip().replace('ב-', '')
        return storeName
    except:
        return 'no zap product'

for u in urls:
    ws1['A1'] = u
    makat = get_sku(u)
    ws1['F1'] = makat
    zapPrice = get_zapPrice(makat)
    ws1['I1'] = zapPrice
    storeName = get_zapStoreName(makat)
    ws1['J1'] = storeName
    ws1.insert_rows(1)
    ws1.append([])
    print("writing product no." + str(i))

ws1['F1'] = 'makat'
ws1['I1'] = 'zap price'
ws1['J1'] = 'zap store'
wb.save("sample.xlsx")
wb.close()
print('end')
I didn't write out all my code, but the basic idea is here.
Each function starts with requests.get, gets what I want, and returns it.
After that, I write it to an Excel file.
The problem shows up after about 1,000 URL checks.
What is the problem?
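One way to make a crawl of this size survive transient network failures is to send every request through a single requests.Session configured with automatic retries. Here is a minimal sketch; the fetch helper, the retry counts, and the timeout are my own assumptions, not part of the original code:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# one shared session with automatic retries for flaky responses
session = requests.Session()
retry = Retry(total=5, backoff_factor=1,
              status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retry))
session.mount('http://', HTTPAdapter(max_retries=retry))

def fetch(url):
    # drop-in replacement for the bare requests.get calls above,
    # with a timeout so a single hung request cannot stall the whole run
    return session.get(url, timeout=30)

Each get_sku / get_price / get_zapPrice call could then use fetch(u) instead of requests.get(u).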
I am trying to read the text data from the URL mentioned in the code, but it throws an error:
ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 2
url = "https://cdn.upgrad.com/UpGrad/temp/d934844e-5182-4b58-b896-4ba2a499aa57/companies.txt"
c = pd.read_csv(url, encoding='utf-8')
It seems there were some encoding issues with pd.read_csv(); it never split the columns. Splitting the rows manually works:
#!/usr/bin/env python3
import sys
import requests
import pandas as pd

url = "https://cdn.upgrad.com/UpGrad/temp/d934844e-5182-4b58-b896-4ba2a499aa57/companies.txt"
r = requests.get(url)
df = None
if r.status_code == 200:
    rows = r.text.split('\r\n')
    header = rows[0].split('\t')
    data = []
    for n in range(1, len(rows)):
        cols = rows[n].split('\t')
        data.append(cols)
    df = pd.DataFrame(columns=header, data=data)
else:
    print("error: unable to load {}".format(url))
    sys.exit(-1)

print(df.shape)
print(df.head(2))
$ ./test.py
(66369, 10)
permalink name homepage_url category_list status country_code state_code region city founded_at
0 /Organization/-Fame #fame http://livfame.com Media operating IND 16 Mumbai Mumbai
1 /Organization/-Qounter :Qounter http://www.qounter.com Application Platforms|Real Time|Social Network... operating USA DE DE - Other Delaware City 04-09-2014
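Alternatively, since the rows above are split on tabs, pd.read_csv can usually load the file directly once it is told the separator. A sketch, where the ISO-8859-1 encoding is a guess for a file that does not decode cleanly as UTF-8:

import pandas as pd

url = "https://cdn.upgrad.com/UpGrad/temp/d934844e-5182-4b58-b896-4ba2a499aa57/companies.txt"
# sep='\t' because the manual split above was on tabs; the encoding is an
# assumption for a file that fails with the default UTF-8 reader
c = pd.read_csv(url, sep='\t', encoding='ISO-8859-1')
print(c.shape)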
I am following this tutorial to retrieve data from news sites.
The main function is getDailyNews. It loops over each news source, requests the API, extracts the data, dumps it into a pandas DataFrame, and then exports the result to a CSV file.
But when I run the code, I get an error.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from tqdm import tqdm, tqdm_notebook
from functools import reduce

def getSources():
    source_url = 'https://newsapi.org/v1/sources?language=en'
    response = requests.get(source_url).json()
    sources = []
    for source in response['sources']:
        sources.append(source['id'])
    return sources

def mapping():
    d = {}
    response = requests.get('https://newsapi.org/v1/sources?language=en')
    response = response.json()
    for s in response['sources']:
        d[s['id']] = s['category']
    return d

def category(source, m):
    try:
        return m[source]
    except:
        return 'NC'

def getDailyNews():
    sources = getSources()
    key = '96f279e1b7f845669089abc016e915cc'
    url = 'https://newsapi.org/v1/articles?source={0}&sortBy={1}&apiKey={2}'
    responses = []
    for i, source in tqdm_notebook(enumerate(sources), total=len(sources)):
        try:
            u = url.format(source, 'top', key)
        except:
            u = url.format(source, 'latest', key)
        response = requests.get(u)
        r = response.json()
        try:
            for article in r['articles']:
                article['source'] = source
            responses.append(r)
        except:
            print('Rate limit exceeded ... please wait and retry in 6 hours')
            return None
    articles = list(map(lambda r: r['articles'], responses))
    articles = list(reduce(lambda x, y: x + y, articles))
    news = pd.DataFrame(articles)
    news = news.dropna()
    news = news.drop_duplicates()
    news.reset_index(inplace=True, drop=True)
    d = mapping()
    news['category'] = news['source'].map(lambda s: category(s, d))
    news['scraping_date'] = datetime.now()
    try:
        aux = pd.read_csv('./data/news.csv')
        aux = aux.append(news)
        aux = aux.drop_duplicates('url')
        aux.reset_index(inplace=True, drop=True)
        aux.to_csv('./data/news.csv', encoding='utf-8', index=False)
    except:
        news.to_csv('./data/news.csv', index=False, encoding='utf-8')
    print('Done')

if __name__ == '__main__':
    getDailyNews()
Error:
FileNotFoundError: [Errno 2] No such file or directory: './data/news.csv'
I know that I have to give the path name in pd.read_csv but I don't know which path I have to give here.
This error would make sense if there wasn't already a data folder in the directory you are executing this program from. There is a similar problem in the post here.
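A minimal fix along those lines (assuming the script is allowed to create the folder itself) is to make sure ./data exists before getDailyNews tries to write to it:

import os

# create the ./data folder next to the script if it is missing,
# so pd.read_csv / to_csv('./data/news.csv') have somewhere to read and write
os.makedirs('./data', exist_ok=True)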