Apply the code to smaller batches of the data set sequentially - Python

I have a data set of tweets retrieved via the Twitter streaming API.
However, I regularly want to be updated on how their public metrics change. Therefore, I wrote the following code to request those public metrics:
import json

import pandas as pd
import requests


def create_url():
    tweet_fields = "tweet.fields=public_metrics"
    tweets_data_path = 'dataset.txt'
    tweets_data = []
    with open(tweets_data_path, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweet = json.loads(line)
                tweets_data.append(tweet)
            except json.JSONDecodeError:
                continue
    df = pd.DataFrame.from_dict(pd.json_normalize(tweets_data), orient='columns')
    df_id = (str(str((df['id'].tolist()))[1:-1])).replace(" ", "")
    ids = "ids=" + df_id
    url = "https://api.twitter.com/2/tweets?{}&{}".format(ids, tweet_fields)
    return url


def bearer_oauth(r):
    r.headers["Authorization"] = f"Bearer {'AAAAAAAAAAAAAAAAAAAAAN%2B7QwEAAAAAEG%2BzRZkmZ4HGizsKCG3MkwlaRzY%3DOwuZeaeHbeMM1JDIafd5riA1QdkDabPiELFsguR4Zba9ywzzOQ'}"
    r.headers["User-Agent"] = "v2TweetLookupPython"
    return r


def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()


def main():
    url = create_url()
    json_response = connect_to_endpoint(url)
    print(json.dumps(json_response, indent=3, sort_keys=True))


if __name__ == "__main__":
    main()
Unfortunately, my data set contains more than 100 IDs, and I want to retrieve the metrics for all of them. Since I can only request 100 IDs at a time, can you help me with how to do that?
Also, I would like to make the request daily at midnight and then store the result in a txt file; maybe you can help me with that as well?

You can chunk your data and send it in batches using itertools.islice.
test.py:
import reprlib
from itertools import islice

import pandas as pd

BASE_URL = "https://api.twitter.com/2/tweets"
CHUNK = 100


def req(ids):
    tmp = reprlib.repr(ids)  # Used here just to shorten the output
    print(f"{BASE_URL}?ids={tmp}")


def main():
    df = pd.DataFrame({"id": range(1000)})
    it = iter(df["id"])
    while chunk := tuple(islice(it, CHUNK)):
        ids = ",".join(map(str, chunk))
        req(ids)


if __name__ == "__main__":
    main()
Test:
$ python test.py
https://api.twitter.com/2/tweets?ids='0,1,2,3,4,5,...5,96,97,98,99'
https://api.twitter.com/2/tweets?ids='100,101,102,...6,197,198,199'
https://api.twitter.com/2/tweets?ids='200,201,202,...6,297,298,299'
https://api.twitter.com/2/tweets?ids='300,301,302,...6,397,398,399'
https://api.twitter.com/2/tweets?ids='400,401,402,...6,497,498,499'
https://api.twitter.com/2/tweets?ids='500,501,502,...6,597,598,599'
https://api.twitter.com/2/tweets?ids='600,601,602,...6,697,698,699'
https://api.twitter.com/2/tweets?ids='700,701,702,...6,797,798,799'
https://api.twitter.com/2/tweets?ids='800,801,802,...6,897,898,899'
https://api.twitter.com/2/tweets?ids='900,901,902,...6,997,998,999'
Note: You'll make multiple requests with this approach, so keep any rate limits in mind.
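As for the second part of the question (running the request every day at midnight and storing the output in a text file), here is a minimal sketch using only the standard library; dump_metrics and the output file name are placeholders for your own logic. On Linux/macOS, a cron entry such as 0 0 * * * python /path/to/script.py is usually the simpler option.
import datetime
import json
import time


def seconds_until_midnight():
    # Time remaining until the next local midnight.
    now = datetime.datetime.now()
    tomorrow = now.date() + datetime.timedelta(days=1)
    midnight = datetime.datetime.combine(tomorrow, datetime.time.min)
    return (midnight - now).total_seconds()


def dump_metrics(json_response):
    # Placeholder: append each day's response as one JSON line to a text file.
    with open("public_metrics.txt", "a") as f:
        f.write(json.dumps(json_response) + "\n")


while True:
    time.sleep(seconds_until_midnight())
    # Run the chunked requests here and pass each response to dump_metrics().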

Related

Pagination for Twitter API in Python

I am using the full archive search of the Twitter API to extract data on past events. I have downloaded the code sample and modified it a bit to also save my data to a file on my local drive, and this is all working well. But I do not know how to implement pagination.
When working with tweepy, there is a special .pages() function, but my current script uses requests.
I tried adding a while loop in my main function using ["next_token"], but I did not really understand the Twitter documentation and could not make it work.
Here is what I have got so far:
# Extended script for full archive search with Twitter academic API
# based on a sample provided by Twitter
# for documentation, see https://developer.twitter.com/en/products/twitter-api/academic-research
import requests
import os
import json

# STEP 1: add bearer token for your academic Twitter API dev account
bearer_token = "MYTOKEN"

# STEP 2: define which API endpoint to query: "all" or "recent"
search_url = "https://api.twitter.com/2/tweets/search/all"

# Optional params:
# start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields

# STEP 3: define query parameters
query_params = {'query': '#WEURO2022',
                'tweet.fields': 'author_id,conversation_id,created_at',
                'expansions': 'geo.place_id',
                'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type',
                'user.fields': 'created_at,description,entities,id,location,name',
                'start_time': '2022-02-15T00:00:01.000Z',
                'end_time': '2022-09-16T23:59:59.000Z',
                'max_results': '500'}


def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2RecentSearchPython"
    return r


def connect_to_endpoint(url, params):
    response = requests.get(url, auth=bearer_oauth, params=params)
    if response.status_code == 200:
        print("Ready to go!")
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    json_data = response.json()
    # return json_data
    # write data to JSON file
    with open('C:\\Users\\####\\Downloads\\MyTweets.json', 'a') as json_f:
        json.dump(json_data, json_f)
        print("JSON data written to file!")


def main():
    json_response = connect_to_endpoint(search_url, query_params)
    while json_response["meta"]["next_token"]:
        query_params["next_token"] = json_response["meta"]["next_token"]


if __name__ == "__main__":
    main()
Can you help me fix this, or point me to a tutorial for less experienced users?
I have found a way to fix my pagination issue, but I dare say that it is a most un-elegant solution:
# Extended script for full archive search with Twitter academic API
# based on a sample provided by Twitter
# for documentation, see https://developer.twitter.com/en/products/twitter-api/academic-research
import requests
import os
import json
import urllib

# STEP 1: add bearer token for your academic Twitter API dev account
bearer_token = "MYTOKEN"

# STEP 2: define which API endpoint to query: "all" or "recent"
search_url = "https://api.twitter.com/2/tweets/search/all"
token_url = "https://api.twitter.com/2/tweets/search/all?next_token="

# Optional params:
# start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields

# STEP 3: define query parameters and number of pages to be retrieved
# query example: (from:twitterdev -is:retweet) OR #twitterdev
query_params = {'query': '((from:Eurovision) OR #esc OR #esc2018) (#ISR OR #NettaBarzilai OR #NettaBarzilai) lang:en',
                'tweet.fields': 'author_id,conversation_id,created_at',
                'expansions': 'geo.place_id',
                'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type',
                'user.fields': 'created_at,description,entities,id,location,name',
                'start_time': '2018-02-15T00:00:01.000Z',
                'end_time': '2018-07-16T23:59:59.000Z',
                'max_results': '500'}
pages = 20
token_list = []


# STEP 4: authenticate
def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2RecentSearchPython"
    return r


# STEP 5: connect to endpoint and run query
def connect_to_endpoint(url, params, next_token):
    try:
        if (len(token_list[-1]) >= 1):
            next_token = token_list[-1]
            target = [token_url, str(next_token)]
            url = "".join(target)
            print(url)
        else:
            url = search_url
            print(url)
    except IndexError:
        url = search_url
        print(url)
    response = requests.get(url, auth=bearer_oauth, params=params)
    if response.status_code == 200:
        print("Ready to go!")
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    json_data = response.json()
    next_token = json_data["meta"]["next_token"]
    token_list.append(next_token)
    print(token_list)
    # STEP 6: write data to JSON file
    with open('C:\\Users\\xxx\\yyy\\NettaESC2018_tweets.json', 'a') as json_f:
        json.dump(json_data, json_f)
        print("JSON data written to file!")


def main():
    for p in range(0, pages):
        try:
            json_response = connect_to_endpoint(url, query_params, next_token)
        except KeyError:
            print("No more tweets found!")
            break


if __name__ == "__main__":
    main()
If anyone has a better suggestion, I am looking forward to it!
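For comparison, here is a minimal sketch of a more compact pagination loop that passes next_token as an ordinary query parameter instead of rebuilding the URL; it reuses the bearer_oauth, search_url, and query_params names from the script above, and tweets.json is a placeholder output path:
import json

import requests


def paginate(url, params, max_pages=20):
    # Yield one page of results at a time until the API stops returning a next_token.
    params = dict(params)  # copy so the original dict is not mutated
    for _ in range(max_pages):
        response = requests.get(url, auth=bearer_oauth, params=params)
        if response.status_code != 200:
            raise Exception(response.status_code, response.text)
        page = response.json()
        yield page
        next_token = page.get("meta", {}).get("next_token")
        if not next_token:
            break
        params["next_token"] = next_token


for page in paginate(search_url, query_params):
    with open("tweets.json", "a") as json_f:
        json.dump(page, json_f)
        json_f.write("\n")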

Issue with Python - KeyError

I'm trying to run a Python script that calls an API to extract data and upload it to a CSV file. The CSV file is created fine, but the script throws an error. Can someone please let me know what I might be doing wrong here?
Error Message:
Code:
import http.client
import json
import csv
import os

conn = http.client.HTTPSConnection("api.betterimpact.com")
conn1 = http.client.HTTPSConnection("api.betterimpact.com")

if os.path.exists("CSVVolunteerOutput.csv"):
    os.remove("CSVVolunteerOutput.csv")

headers = {
    'Authorization': 'Basic XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
    'Cookie': '; TrackingClientId=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
}

conn.request("GET", "/v1/enterprise/users/?volunteer_status=accepted&include_custom_fields=false&include_qualifications=false&page_Size=250&include_verified_volunteers_background_check_results=false", headers=headers)
res = conn.getresponse()
data = json.load(res)

conn1.request("GET", "/v1/enterprise/users/?volunteer_status=accepted&include_custom_fields=false&include_qualifications=false&page_Size=250&include_verified_volunteers_background_check_results=false&page_number=0", headers=headers)
res1 = conn1.getresponse()
data1 = json.load(res1)

if data == None or data == "" or len(data) == 0:
    print("Check API Credentials..")
    exit()

volunteer_status = "Accepted"
pageNum = 0
_page_count = data1['header']['page_count']

while True:
    pageNum += 1
    with open('CSVVolunteerOutput.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["SyncID", "FirstName", "LastName", "Jobtitle", "Division", "BusinessUnit", "RegionArea", "WorkLocation", "Department", "WorkEmailAddressPrimary",
                         "PersonalEmailAddress", "PersonalMobilePhonePrimary", "WorkCountry"])
        for user in data['users']:
            _id = user['user_id']
            _firstName = user['first_name']
            _surName = user['last_name']
            _emailAddress = user['email_address']
            _emailAddressSec = user['secondary_email_address']
            _cellPhone = user['cell_phone']
            _country = user['country']
            for details in user['memberships']:
                _orgName = details['organization_name']
                _volunteerStatus = details['volunteer_status']
                if volunteer_status == _volunteerStatus:
                    writer.writerow([_id, _firstName, _surName, "Volunteer", "", "", "", _orgName, "", _emailAddress,
                                     _emailAddressSec, _cellPhone, _country])
    if pageNum > int(_page_count):
        break
    else:
        conn.request("GET", "/v1/enterprise/users/?volunteer_status=accepted&include_custom_fields=false&include_qualifications=false&page_Size=250&include_verified_volunteers_background_check_results=false&page_number="+str(pageNum), headers=headers)
        res = conn.getresponse()
        data = json.load(res)

print("CSV file created successfully")
API Documentation is here: https://www.betterimpact.com/volunteer-impact-help/it-api/
Thanks.
I can't run the code, so I'm guessing.
You have data = ... in two places:
before the while loop
inside the while loop
like this:
# --- first `data` ---
conn.request(...)
res = conn.getresponse()
data = json.load(res)

# ... code ...

while True:
    # ... code ...
    for user in data['users']:
        # ... code ...
    if pageNum > int(_page_count):
        break
    else:
        # --- second `data` ---
        conn.request(...)
        res = conn.getresponse()
        data = json.load(res)
It seems you checked for users in data only after the first data = ..., but you didn't check it after the second data = ..., and that request can give you data without users.
You could check both in one place:
if "users" in data:
    for user in data['users']:
        # ... code ...
By the way: if you want to append to the file in a loop, it is better to write the header row once before the loop. In the current version you write the headers before every page of data. Or you could append all the rows to a list and write everything at once after the loop.
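A minimal sketch of that restructuring (headers written once, a guard for missing users, and a shortened column list for brevity; base_path is a stand-in for the long request URL, and conn, headers, data, _page_count, and pageNum are assumed to be set up as in the question):
import csv
import json

# Stand-in for the long request path used in the question.
base_path = "/v1/enterprise/users/?volunteer_status=accepted&page_Size=250"

with open('CSVVolunteerOutput.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    # Header row written once, before paging through the API.
    writer.writerow(["SyncID", "FirstName", "LastName", "WorkLocation",
                     "WorkEmailAddressPrimary", "PersonalEmailAddress",
                     "PersonalMobilePhonePrimary", "WorkCountry"])
    while True:
        pageNum += 1
        if "users" in data:
            for user in data['users']:
                for details in user['memberships']:
                    if details['volunteer_status'] == "Accepted":
                        writer.writerow([user['user_id'], user['first_name'], user['last_name'],
                                         details['organization_name'], user['email_address'],
                                         user['secondary_email_address'], user['cell_phone'],
                                         user['country']])
        if pageNum > int(_page_count):
            break
        conn.request("GET", base_path + "&page_number=" + str(pageNum), headers=headers)
        res = conn.getresponse()
        data = json.load(res)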

Call API for each element in list

I have a list with over 1000 IDs and I want to call an API with different endpoints for every element of the list.
Example:
customerlist = [803818, 803808, 803803,803738,803730]
I tried the following:
import json
import requests
import pandas as pd

API_BASEURL = "https://exampleurl.com/"
API_TOKEN = "abc"
HEADERS = {'content-type': 'application/json',
           'Authorization': API_TOKEN}


def get_data(endpoint):
    for i in customerlist:
        api_endpoint = endpoint
        params = {'customerid': i}
        response = requests.get(f"{API_BASEURL}/{api_endpoint}",
                                params=params,
                                headers=HEADERS)
        if response.status_code == 200:
            res = json.loads(response.text)
        else:
            raise Exception(f'API error with status code {response.status_code}')
        res = pd.DataFrame([res])
        return res


get_data(endpointexample)
get_data(endpointexample)
This works, but it only returns the values for the first element of the list (803818). I want the function to return the values for every ID from customerlist for the endpoint I defined in the function argument.
I found this - possibly related - question, but I couldn't figure my problem out.
There is probably an easy solution for this which I am not seeing, as I am just starting with Python. Thanks.
The moment a function hits a return statement, it immediately finishes. Since your return statement is in the loop, the other iterations never actually get called.
To fix, you can create a list outside the loop, append to it every loop iteration, and then return the DataFrame created with that list:
def get_data(endpoint):
    responses = []
    for i in customerlist:
        api_endpoint = endpoint
        params = {'customerid': i}
        response = requests.get(f"{API_BASEURL}/{api_endpoint}",
                                params=params,
                                headers=HEADERS)
        if response.status_code == 200:
            res = json.loads(response.text)
        else:
            raise Exception(f'API error with status code {response.status_code}')
        responses.append(res)
    return pd.DataFrame(responses)
A much cleaner solution would be to move the loop outside the function and use a list comprehension:
def get_data(endpoint, i):
    api_endpoint = endpoint
    params = {'customerid': i}
    response = requests.get(f"{API_BASEURL}/{api_endpoint}",
                            params=params,
                            headers=HEADERS)
    if response.status_code == 200:
        res = json.loads(response.text)
    else:
        raise Exception(f'API error with status code {response.status_code}')
    return res


responses = pd.DataFrame([get_data(endpoint, i) for i in customerlist])

Unable to store pandas data frame as a csv

I am following this tutorial to retrieve data from news sites.
The main function is getDailyNews. It loops over each news source, requests the API, extracts the data, dumps it into a pandas DataFrame, and then exports the result to a CSV file.
But when I run the code, I get an error.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from tqdm import tqdm, tqdm_notebook
from functools import reduce


def getSources():
    source_url = 'https://newsapi.org/v1/sources?language=en'
    response = requests.get(source_url).json()
    sources = []
    for source in response['sources']:
        sources.append(source['id'])
    return sources


def mapping():
    d = {}
    response = requests.get('https://newsapi.org/v1/sources?language=en')
    response = response.json()
    for s in response['sources']:
        d[s['id']] = s['category']
    return d


def category(source, m):
    try:
        return m[source]
    except:
        return 'NC'


def getDailyNews():
    sources = getSources()
    key = '96f279e1b7f845669089abc016e915cc'
    url = 'https://newsapi.org/v1/articles?source={0}&sortBy={1}&apiKey={2}'
    responses = []
    for i, source in tqdm_notebook(enumerate(sources), total=len(sources)):
        try:
            u = url.format(source, 'top', key)
        except:
            u = url.format(source, 'latest', key)
        response = requests.get(u)
        r = response.json()
        try:
            for article in r['articles']:
                article['source'] = source
            responses.append(r)
        except:
            print('Rate limit exceeded ... please wait and retry in 6 hours')
            return None
    articles = list(map(lambda r: r['articles'], responses))
    articles = list(reduce(lambda x, y: x + y, articles))
    news = pd.DataFrame(articles)
    news = news.dropna()
    news = news.drop_duplicates()
    news.reset_index(inplace=True, drop=True)
    d = mapping()
    news['category'] = news['source'].map(lambda s: category(s, d))
    news['scraping_date'] = datetime.now()
    try:
        aux = pd.read_csv('./data/news.csv')
        aux = aux.append(news)
        aux = aux.drop_duplicates('url')
        aux.reset_index(inplace=True, drop=True)
        aux.to_csv('./data/news.csv', encoding='utf-8', index=False)
    except:
        news.to_csv('./data/news.csv', index=False, encoding='utf-8')
    print('Done')


if __name__ == '__main__':
    getDailyNews()
Error:
FileNotFoundError: [Errno 2] No such file or directory: './data/news.csv'
I know that I have to give the path name in pd.read_csv but I don't know which path I have to give here.
This error makes sense if there isn't already a data folder in the directory you are executing this program from: pandas will not create the missing directory for you. There is a similar problem in the post here.
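One way to rule this out (a minimal sketch; the path is the one used in the script) is to create the folder before reading or writing:
import os

# pd.read_csv / DataFrame.to_csv will not create missing directories themselves.
os.makedirs('./data', exist_ok=True)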

Python check if website exists for a list of websites

I want to check if a website exists, given a list of websites in the format XXXXX.com, where XXXXX is a 5-digit number. So I want to go through 00000 up to 99999 and see if those variants of the website exist.
I want to do something like
import requests

request = requests.get('http://www.example.com')
if request.status_code == 200:
    print('Web site exists')
else:
    print('Web site does not exist')
But I want to generate a list of some sort (or even just export a list to CSV), so that for each URL I know whether it exists or not.
Any advice would be great!
I'm going to assume that you have a large list of URLs and want to read them in from a source file, say a text file, rather than hard-coding a large list of URLs in the Python file. If that's the case, run the script below and you'll get what you want.
import urllib.request
import urllib.error
import time
from multiprocessing import Pool

start = time.time()

file = open('C:\\your_path\\check_me.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
print(urls)


def checkurl(url):
    try:
        conn = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print('HTTPError: {}'.format(e.code) + ', ' + url)
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print('URLError: {}'.format(e.reason) + ', ' + url)
    else:
        # 200
        print('good' + ', ' + url)


if __name__ == "__main__":
    p = Pool(processes=20)
    result = p.map(checkurl, urls)
    print("done in : ", time.time() - start)
Try combining range and the string zfill method in a loop.
import requests


def test_for_200(url):
    req = requests.get(url)
    return req.status_code == 200


def numbers():
    for n in range(100000):
        yield str(n).zfill(5)


results = {}
for num in numbers():
    url = "http://{}.com".format(num)
    results[num] = test_for_200(url)
results will look something like this:
>>> results
{'00000': True, '00001': False, ...}
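Since the question also mentions exporting the results to CSV, here is a minimal sketch (assuming the results dict built above) using the standard csv module; website_check.csv is a placeholder file name:
import csv

with open("website_check.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["number", "exists"])
    for num, exists in sorted(results.items()):
        writer.writerow([num, exists])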
