How can I request the following API with multi-threading in Python? Or are there other ways to make the API requests more efficient? Multi-processing? AsyncIO?
import pandas as pd
import requests

def getAPIreturn(row):
    para = row['para']
    url = f"http://localhost/search?name={para}"
    try:
        # requests can decode the JSON body directly
        return_json = requests.get(url).json()
        data = return_json['data']
        score = data[0]['score']
        count = data[1]['count']
        status = data[2]['status']
    except Exception:
        score = None
        count = None
        status = None
    row['score'] = score
    row['count'] = count
    row['status'] = status
    return row

data = pd.read_csv("input.csv")
data = data.apply(getAPIreturn, axis=1)
data.to_csv("output.csv", index=False)
I wrote a script to get historical data from the public Trades endpoint of the Kraken API; the code is as follows:
import pandas as pd
import json
import time
import urllib.request

def get_data(pair, since, until):
    global data
    global query
    global json_response
    global api_data

    data_columns = ["price", "volume", "time", "buy/sell", "market/limit", "miscellaneous"]
    data = pd.DataFrame(columns=data_columns)
    api_start = since
    app_start_time = time.time()
    counter = 1

    while api_start < until:
        last_time = time.time()
        api_domain = "https://api.kraken.com/0/public/Trades" + \
            "?pair=%(pair)s&since=%(since)s" % {"pair": pair, "since": api_start}
        api_request = urllib.request.Request(api_domain)
        try:
            api_data = urllib.request.urlopen(api_request).read()
        except Exception:
            time.sleep(3)
        api_data = json.loads(api_data)
        if len(api_data["error"]) != 0:
            print(api_data["error"])
            time.sleep(3)
            continue
        query = pd.DataFrame(api_data["result"][pair], columns=data_columns)
        data = data.append(query, ignore_index=True)
        api_start = int(api_data["result"]["last"][:10])
        counter += 1
        time.sleep(1)
        print("Request number: %s" % counter)
        print("Time since start: %s minutes" % round((time.time() - app_start_time)/60, 2))
        print("Time since last request: %s seconds" % round((time.time() - last_time), 2))
        print("last: %s" % api_start)
        print("")

get_data("XXBTZUSD", 1414761200, 1455761200)
After some successful responses, I get flawed responses: at some point, the UNIX timestamp simply jumps from 142894080.33775 to 1654992002.801943, resulting in wrong data.
Is that a problem with my code or with the API?
Thanks in advance.
Taking the liberty of simplifying your code, I cannot confirm your observation; I get proper timestamps.
Try this:
import json
import pandas as pd
import requests

def get_data(pair, since):
    url = f"https://api.kraken.com/0/public/Trades?pair={pair}&since={since}"
    api_data = requests.get(url)
    api_data = json.loads(api_data.content)
    return api_data

results = get_data("XBTUSD", 1414761200)

columns = ["price", "volume", "time", "buy/sell", "market/limit", "miscellaneous"]
df = pd.DataFrame(results["result"]["XXBTZUSD"], columns=columns)
df.time = df.time.astype(int)
df.head()
Print out:
price volume time buy/sell market/limit miscellaneous
0 340.09209 0.02722956 1414815678 s m
1 340.15346 0.21604000 1414820304 s m
2 340.00000 0.03395999 1414820304 s m
3 340.00001 0.01000000 1414821818 s l
4 340.00000 0.25668009 1414821818 s l
Edit:
Using pagination I can confirm the jump in timestamps. The problem very likely lies with the API.
import json
import time
from datetime import datetime

import pandas as pd
import requests

def get_data(pair, since):
    url = f"https://api.kraken.com/0/public/Trades?pair={pair}&since={since}"
    api_data = requests.get(url)
    api_data = json.loads(api_data.content)
    return api_data

start_ts = 1414761200
frames = []
for _ in range(30):
    print(start_ts)
    print(datetime.fromtimestamp(int(start_ts)))
    tmp = get_data("XBTUSD", start_ts)
    start_ts = tmp["result"]["last"][:10]
    frames.append(pd.DataFrame(tmp["result"]["XXBTZUSD"]))
    time.sleep(2)
Print out after a couple of iterations:
1438313128
2015-07-31 05:25:28
1653648031
2022-05-27 12:40:31
I have a data set of tweets retrieved via the Twitter streaming API.
However, I regularly want to be updated on how their public metrics change, so I wrote code to request those public metrics:
import json
import requests
import pandas as pd

def create_url():
    tweet_fields = "tweet.fields=public_metrics"
    tweets_data_path = 'dataset.txt'
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except:
            continue
    df = pd.DataFrame.from_dict(pd.json_normalize(tweets_data), orient='columns')
    # build a comma-separated id list for the ids= query parameter
    df_id = (str(str((df['id'].tolist()))[1:-1])).replace(" ", "")
    ids = "ids=" + df_id
    url = "https://api.twitter.com/2/tweets?{}&{}".format(ids, tweet_fields)
    return url

def bearer_oauth(r):
    r.headers["Authorization"] = f"Bearer {'AAAAAAAAAAAAAAAAAAAAAN%2B7QwEAAAAAEG%2BzRZkmZ4HGizsKCG3MkwlaRzY%3DOwuZeaeHbeMM1JDIafd5riA1QdkDabPiELFsguR4Zba9ywzzOQ'}"
    r.headers["User-Agent"] = "v2TweetLookupPython"
    return r

def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()

def main():
    url = create_url()
    json_response = connect_to_endpoint(url)
    print(json.dumps(json_response, indent=3, sort_keys=True))

if __name__ == "__main__":
    main()
Unfortunately, my data set has more than 100 IDs in it, and I want to retrieve the metrics for all of them. As I can only request 100 IDs at a time, can you help me with how to do that?
Also, I would like to make the request daily at midnight and then store the result in a txt file; maybe you can help me with that too?
You can chunk your data and send it in batches using itertools.islice.
test.py:
import reprlib
from itertools import islice

import pandas as pd

BASE_URL = "https://api.twitter.com/2/tweets"
CHUNK = 100

def req(ids):
    tmp = reprlib.repr(ids)  # Used here just to shorten the output
    print(f"{BASE_URL}?ids={tmp}")

def main():
    df = pd.DataFrame({"id": range(1000)})
    it = iter(df["id"])
    while chunk := tuple(islice(it, CHUNK)):
        ids = ",".join(map(str, chunk))
        req(ids)

if __name__ == "__main__":
    main()
Test:
$ python test.py
https://api.twitter.com/2/tweets?ids='0,1,2,3,4,5,...5,96,97,98,99'
https://api.twitter.com/2/tweets?ids='100,101,102,...6,197,198,199'
https://api.twitter.com/2/tweets?ids='200,201,202,...6,297,298,299'
https://api.twitter.com/2/tweets?ids='300,301,302,...6,397,398,399'
https://api.twitter.com/2/tweets?ids='400,401,402,...6,497,498,499'
https://api.twitter.com/2/tweets?ids='500,501,502,...6,597,598,599'
https://api.twitter.com/2/tweets?ids='600,601,602,...6,697,698,699'
https://api.twitter.com/2/tweets?ids='700,701,702,...6,797,798,799'
https://api.twitter.com/2/tweets?ids='800,801,802,...6,897,898,899'
https://api.twitter.com/2/tweets?ids='900,901,902,...6,997,998,999'
Note: You'll make multiple requests with this approach so keep in mind any rate limits.
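For the second part of the question (running the lookup daily at midnight and saving the result to a txt file), here is a hedged sketch under these assumptions: the connect_to_endpoint and bearer_oauth helpers from the question are available, and the dated output file name and 100-ID chunk size are purely illustrative.

import json
import time
from datetime import datetime, timedelta
from itertools import islice

def fetch_all_metrics(ids, chunk=100):
    # yield one JSON response per batch of up to `chunk` tweet IDs
    it = iter(ids)
    while batch := tuple(islice(it, chunk)):
        url = "https://api.twitter.com/2/tweets?ids={}&tweet.fields=public_metrics".format(
            ",".join(map(str, batch)))
        yield connect_to_endpoint(url)  # helper from the question, assumed available

def run_daily(ids):
    while True:
        # sleep until the next midnight, then append that day's responses to a dated txt file
        now = datetime.now()
        midnight = (now + timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)
        time.sleep((midnight - now).total_seconds())
        with open(f"metrics_{midnight:%Y-%m-%d}.txt", "a") as fh:
            for response in fetch_all_metrics(ids):
                fh.write(json.dumps(response) + "\n")

In practice a cron entry such as 0 0 * * * that runs the script once and exits is usually more robust than a long-lived sleep loop.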
I'm currently trying to put two things together when checking multiple websites from my input CSV file:
Check HTTP status
Check whether the website displays specific keywords
then save the results to a new CSV file.
My input.csv:
id url
1 https://example123.com
2 https://envato.com/blog/30-outstanding-coming-soon-and-under-construction-website-templates/
3 https://mundoshoponline.com
My Code:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import asyncio
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

df = pd.read_csv('path/to/my/input.csv')
#my csv has urls in the 1st column
urls = df.T.values.tolist()[1]
results = {}
status = []

async def scrape(url):
    try:
        r = requests.get(url, timeout=(3, 6))
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        #all keywords to check on the website
        data = {
            "coming soon": soup.body.findAll(text = re.compile("coming soon", re.I)),
            "Opening Soon": soup.body.findAll(text = re.compile("Opening Soon", re.I)),
            "Forbidden": soup.body.findAll(text = re.compile("Forbidden", re.I)),
            "Page not found": soup.body.findAll(text = re.compile("Page not found", re.I)),
            "Under Construction": soup.body.findAll(text = re.compile("Under Construction", re.I)),
            "Currently Unavailable": soup.body.findAll(text = re.compile("Currently Unavailable", re.I))}
        results[url] = data
    #check for http status and save to status list
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        status.append("Down")
    except requests.exceptions.HTTPError:
        status.append("Other")
    else:
        status.append("OK")

async def main():
    await asyncio.wait([scrape(url) for url in urls])

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()

comingList = []
openingList = []
forbiddenList = []
notfoundList = []
underList = []
currentlyList = []

#mark x if there are any hits for specific keyword
for url in results:
    comingList.append("x" if len(results[url]["coming soon"]) > 0 else "")
    openingList.append("x" if len(results[url]["Opening Soon"]) > 0 else "")
    forbiddenList.append("x" if len(results[url]["Forbidden"]) > 0 else "")
    notfoundList.append("x" if len(results[url]["Page not found"]) > 0 else "")
    underList.append("x" if len(results[url]["Under Construction"]) > 0 else "")
    currentlyList.append("x" if len(results[url]["Currently Unavailable"]) > 0 else "")

df["comingSoon"] = pd.DataFrame(comingList, columns=['comingSoon'])
df["openingSoon"] = pd.DataFrame(openingList, columns=['openingSoon'])
df["forbidden"] = pd.DataFrame(forbiddenList, columns=['forbidden'])
df["notfound2"] = pd.DataFrame(notfoundList, columns=['notfound2'])
df["underConstruction"] = pd.DataFrame(underList, columns=['underConstruction'])
df["currentlyUnavailable"] = pd.DataFrame(currentlyList, columns=['currentlyUnavailable'])
df['status'] = status
print(df)
df.to_csv('path/to/my/output.csv', index=False)
However, whenever I run the above script with for url in urls:, it throws this error for some of my URLs, the script breaks, and output.csv is not generated:
Traceback (most recent call last):
File "path/to/myscan.py", line 51, in <module>
comingList.append("x" if len(results[url]["coming soon"]) > 0 else "")
KeyError: 'http://example123.com'
When running it with for url in results: instead, output.csv looks as follows:
(screenshot of the generated output.csv)
The output seems erroneous: the first row has keywords marked as present (comingSoon and underConstruction columns) and the status column set to Down, but the website doesn't contain the 'coming soon' or 'under construction' strings.
Would someone be able to help me with this? I believe there might be an issue in my loop or try/except part of the code. I'm happy to provide more information if the above is not sufficient. Thank you in advance.
I think your main problem is that you are iterating over all of the urls, some of which may have failed and therefore do not exist as keys in your results.
A much safer way is to iterate over the subset of urls that you know have succeeded and have a key in results, so instead of
for url in urls:
you could make it
for url in results:
To make the final results consistent with the input order of your urls:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import asyncio
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

df = pd.read_csv('./input.csv')
#my csv has urls in the 4th column
urls = ['example123.com', 'https://envato.com/blog/30-outstanding-coming-soon-and-under-construction-website-templates/', 'http://alotechgear.com']
results = {}
status = {}

async def scrape(url):
    try:
        r = requests.get(url, timeout=(3, 6))
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        #all keywords to check on the website
        data = {
            "coming soon": soup.body.findAll(text = re.compile("coming soon", re.I)),
            "Opening Soon": soup.body.findAll(text = re.compile("Opening Soon", re.I)),
            "Forbidden": soup.body.findAll(text = re.compile("Forbidden", re.I)),
            "Page not found": soup.body.findAll(text = re.compile("Page not found", re.I)),
            "Under Construction": soup.body.findAll(text = re.compile("Under Construction", re.I)),
            "Currently Unavailable": soup.body.findAll(text = re.compile("Currently Unavailable", re.I))}
        results[url] = data
    #check for http status and save to status dict
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, requests.exceptions.MissingSchema):
        status[url] = "Down"
    except requests.exceptions.HTTPError:
        status[url] = "Other"
    else:
        status[url] = "OK"

async def main():
    await asyncio.wait([scrape(url) for url in urls])

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()

comingList = []
openingList = []
forbiddenList = []
notfoundList = []
underList = []
currentlyList = []
statusList = []

#mark x if there are any hits for specific keyword
for url in urls:
    if(not results.get(url)):
        statusList.append(status.get(url))
        notfoundList.append("x")
        comingList.append("-")
        openingList.append("-")
        forbiddenList.append("-")
        underList.append("-")
        currentlyList.append("-")
    else:
        statusList.append(status.get(url))
        comingList.append("x" if len(results[url].get("coming soon")) > 0 else "-")
        openingList.append("x" if len(results[url].get("Opening Soon")) > 0 else "-")
        forbiddenList.append("x" if len(results[url].get("Forbidden")) > 0 else "-")
        notfoundList.append("x" if len(results[url].get("Page not found")) > 0 else "-")
        underList.append("x" if len(results[url].get("Under Construction")) > 0 else "-")
        currentlyList.append("x" if len(results[url].get("Currently Unavailable")) > 0 else "-")

df["comingSoon"] = pd.DataFrame(comingList, columns=['comingSoon'])
df["openingSoon"] = pd.DataFrame(openingList, columns=['openingSoon'])
df["forbidden"] = pd.DataFrame(forbiddenList, columns=['forbidden'])
df["notfound2"] = pd.DataFrame(notfoundList, columns=['notfound2'])
df["underConstruction"] = pd.DataFrame(underList, columns=['underConstruction'])
df["currentlyUnavailable"] = pd.DataFrame(currentlyList, columns=['currentlyUnavailable'])
df['status'] = pd.DataFrame(statusList, columns=['Status'])
print(df)
df.to_csv('./output.csv', index=False)
sample result:
id url comingSoon openingSoon forbidden notfound2 underConstruction currentlyUnavailable status
0 1 https://example123.com - - - x - - Down
1 2 https://envato.com/blog/30-outstanding-c... x - - - x - OK
2 3 https://mundoshoponline.com - - - x - - Down
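One aside, separate from the KeyError fix above: requests.get is a blocking call, so the async def coroutines in both versions still run one after another. A minimal sketch of one way to actually overlap the requests, assuming Python 3.9+ for asyncio.to_thread and covering only the status part for brevity:

import asyncio
import requests

async def scrape(url):
    # run the blocking requests call in a worker thread so the coroutines can overlap
    try:
        r = await asyncio.to_thread(requests.get, url, timeout=(3, 6))
        r.raise_for_status()
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout,
            requests.exceptions.MissingSchema):
        return url, "Down"
    except requests.exceptions.HTTPError:
        return url, "Other"
    return url, "OK"

async def main(urls):
    # gather preserves the order of the input urls
    return await asyncio.gather(*(scrape(u) for u in urls))

statuses = dict(asyncio.run(main(["https://example123.com", "https://mundoshoponline.com"])))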
I am following this tutorial to retrieve data from news sites.
The main function is getDailyNews. It loops over each news source, requests the API, extracts the data, dumps it into a pandas DataFrame, and then exports the result to a CSV file.
But when I run the code, I get an error.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from tqdm import tqdm, tqdm_notebook
from functools import reduce

def getSources():
    source_url = 'https://newsapi.org/v1/sources?language=en'
    response = requests.get(source_url).json()
    sources = []
    for source in response['sources']:
        sources.append(source['id'])
    return sources

def mapping():
    d = {}
    response = requests.get('https://newsapi.org/v1/sources?language=en')
    response = response.json()
    for s in response['sources']:
        d[s['id']] = s['category']
    return d

def category(source, m):
    try:
        return m[source]
    except:
        return 'NC'

def getDailyNews():
    sources = getSources()
    key = '96f279e1b7f845669089abc016e915cc'
    url = 'https://newsapi.org/v1/articles?source={0}&sortBy={1}&apiKey={2}'
    responses = []
    for i, source in tqdm_notebook(enumerate(sources), total=len(sources)):
        try:
            u = url.format(source, 'top', key)
        except:
            u = url.format(source, 'latest', key)
        response = requests.get(u)
        r = response.json()
        try:
            for article in r['articles']:
                article['source'] = source
            responses.append(r)
        except:
            print('Rate limit exceeded ... please wait and retry in 6 hours')
            return None
    articles = list(map(lambda r: r['articles'], responses))
    articles = list(reduce(lambda x, y: x + y, articles))
    news = pd.DataFrame(articles)
    news = news.dropna()
    news = news.drop_duplicates()
    news.reset_index(inplace=True, drop=True)
    d = mapping()
    news['category'] = news['source'].map(lambda s: category(s, d))
    news['scraping_date'] = datetime.now()
    try:
        aux = pd.read_csv('./data/news.csv')
        aux = aux.append(news)
        aux = aux.drop_duplicates('url')
        aux.reset_index(inplace=True, drop=True)
        aux.to_csv('./data/news.csv', encoding='utf-8', index=False)
    except:
        news.to_csv('./data/news.csv', index=False, encoding='utf-8')
    print('Done')

if __name__ == '__main__':
    getDailyNews()
Error:
FileNotFoundError: [Errno 2] No such file or directory: './data/news.csv'
I know that I have to give a path name to pd.read_csv, but I don't know which path to give here.
This error would make sense if there wasn't already a data folder in the directory you are executing this program from. There is a similar problem in the post here.
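A minimal way to sidestep the error is to create that folder before the script writes to it, for example (the ./data path matches the one hard-coded in the tutorial code):

from pathlib import Path

# create ./data under the working directory if it does not exist yet,
# so pandas can write ./data/news.csv without raising FileNotFoundError
Path("./data").mkdir(parents=True, exist_ok=True)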
I'm trying to check whether the quantity in stock is above 0 at:
https://www.astro-bot.io/api/commerce/inventory/stock/?crumb=BQXy6KNmMGWENWE3YWQzMTc3MDU5NzE1ODdkNDNiM2RmZTEyNjg4&itemId=5b0dc9172b6a283905dabe08
but I'm getting:
TypeError: 'int' object is not subscriptable
Code:
import requests
from bs4 import BeautifulSoup as bs4
from threading import Thread
import json

s = requests.session()

def checkstock():
    global session
    response = s.get('https://www.astro-bot.io/api/commerce/inventory/stock/?crumb=BQXy6KNmMGWENWE3YWQzMTc3MDU5NzE1ODdkNDNiM2RmZTEyNjg4&itemId=5b0dc9172b6a283905dabe08')
    data = json.loads(response.text)
    stock = data['results'][0]['qtyInStock'][0]['available']
    if stock in [0]:
        print("out of stock!")
    else:
        print("in stock")

checkstock()
data = json.loads(response.text)
stock = data['results'][0]['qtyInStock'][0]['available']
This should be
data = json.loads(response.text)
stock = data['results'][0]['qtyInStock']
You need to loop through your results, as there may be more than one record. requests already provides an easy way to convert the response to JSON.
A simpler way to write your code is:
def checkstock():
    global session
    response = s.get('https://www.astro-bot.io/api/commerce/inventory/stock/?crumb=BQXy6KNmMGWENWE3YWQzMTc3MDU5NzE1ODdkNDNiM2RmZTEyNjg4&itemId=5b0dc9172b6a283905dabe08')
    data = response.json()
    for result in data['results']:
        if result['qtyInStock'] == 0:
            print('Out of stock')
        else:
            print('Available')
def checkstock():
    global session
    response = s.get('https://www.astro-bot.io/api/commerce/inventory/stock/?crumb=BQXy6KNmMGWENWE3YWQzMTc3MDU5NzE1ODdkNDNiM2RmZTEyNjg4&itemId=5b0dc9172b6a283905dabe08')
    data = json.loads(response.text)
    results = data.get('results', [{}])[0]
    try:
        qtyInStock = [{}] if (results['qtyInStock'] == 0) else results['qtyInStock']
    except TypeError:
        qtyInStock = [{}]
    stock = qtyInStock[0].get('available', 0)
    print('in stock') if (stock != 0) else print('out of stock!')