I am using the full archive search of the Twitter API to extract data on past events. I have downloaded the code sample and modified it a bit to also save my data to a file on my local drive, and this is all working well. But I do not know how to implement pagination.
When working with tweepy, there is a special .pages() function, but my current script uses requests.
I tried adding a while loop in my main function using ["next_token"], but I did not really understand the Twitter documentation and could not make it work.
Here is what I have got so far:
# Extended script for full archive search with Twitter academic API
# based on a sample provided by Twitter
# for documentation, see https://developer.twitter.com/en/products/twitter-api/academic-research
import requests
import os
import json
# STEP 1: add bearer token for your academic Twitter API dev account
bearer_token = "MYTOKEN"
# STEP 2: define which API endpoint to query: "all" or "recent"
search_url = "https://api.twitter.com/2/tweets/search/all"
# Optional params:
# start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
# STEP 3: define query parameters
query_params = {'query': '#WEURO2022',
'tweet.fields': 'author_id,conversation_id,created_at',
'expansions': 'geo.place_id',
'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type',
'user.fields': 'created_at,description,entities,id,location,name',
'start_time': '2022-02-15T00:00:01.000Z',
'end_time': '2022-09-16T23:59:59.000Z',
'max_results':'500'}
def bearer_oauth(r):
"""
Method required by bearer token authentication.
"""
r.headers["Authorization"] = f"Bearer {bearer_token}"
r.headers["User-Agent"] = "v2RecentSearchPython"
return r
def connect_to_endpoint(url, params):
response = requests.get(url, auth=bearer_oauth, params=params)
if response.status_code == 200:
print("Ready to go!")
if response.status_code != 200:
raise Exception(response.status_code, response.text)
json_data=response.json()
# return json_data
# write data to JSON file
with open('C:\\Users\\####\\Downloads\\MyTweets.json', 'a') as json_f:
json.dump(json_data, json_f)
print("JSON data written to file!")
def main():
json_response = connect_to_endpoint(search_url, query_params)
while json_response["meta"]["next_token"]:
query_params["next_token"] = json_response["meta"]["next_token"]
if __name__ == "__main__":
main()
Can you help me fix this, or point me to a tutorial for less experienced users?
I have found a way to fix my pagination issue, but I dare say it is a most inelegant solution:
# Extended script for full archive search with Twitter academic API
# based on a sample provided by Twitter
# for documentation, see https://developer.twitter.com/en/products/twitter-api/academic-research
import requests
import os
import json
import urllib
# STEP 1: add bearer token for your academic Twitter API dev account
bearer_token = "MYTOKEN"
# STEP 2: define which API endpoint to query: "all" or "recent"
search_url = "https://api.twitter.com/2/tweets/search/all"
token_url= "https://api.twitter.com/2/tweets/search/all?next_token="
# Optional params:
# start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
# STEP 3: define query parameters and number of pages to be retrieved
# query example: (from:twitterdev -is:retweet) OR #twitterdev
query_params = {'query': '((from:Eurovision) OR #esc OR #esc2018) (#ISR OR #NettaBarzilai OR #NettaBarzilai) lang:en',
'tweet.fields': 'author_id,conversation_id,created_at',
'expansions': 'geo.place_id',
'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type',
'user.fields': 'created_at,description,entities,id,location,name',
'start_time': '2018-02-15T00:00:01.000Z',
'end_time': '2018-07-16T23:59:59.000Z',
'max_results':'500'}
pages = 20
token_list=[]
# STEP 4: authenticate
def bearer_oauth(r):
"""
Method required by bearer token authentication.
"""
r.headers["Authorization"] = f"Bearer {bearer_token}"
r.headers["User-Agent"] = "v2RecentSearchPython"
return r
# STEP 5: connect to endpoint and run query
def connect_to_endpoint(url, params, next_token):
try:
if (len(token_list[-1]) >= 1):
next_token=token_list[-1]
target=[token_url, str(next_token)]
url="".join(target)
print(url)
else:
url = search_url
print(url)
except IndexError:
url = search_url
print(url)
response = requests.get(url, auth=bearer_oauth, params=params)
if response.status_code == 200:
print("Ready to go!")
if response.status_code != 200:
raise Exception(response.status_code, response.text)
    json_data = response.json()
    # STEP 6: write data to JSON file first, so the last page is still saved
    # when the response no longer contains a "next_token"
    with open('C:\\Users\\xxx\\yyy\\NettaESC2018_tweets.json', 'a') as json_f:
        json.dump(json_data, json_f)
        print("JSON data written to file!")
    next_token = json_data["meta"]["next_token"]
    token_list.append(next_token)
    print(token_list)
def main():
for p in range(0, pages):
try:
            json_response = connect_to_endpoint(search_url, query_params, None)
except KeyError:
print("No more tweets found!")
break
if __name__ == "__main__":
main()
If anyone has a better suggestion, I am looking forward to it!
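If it helps anyone else, here is a tidier sketch of the same idea (it reuses bearer_oauth, search_url and query_params from the script above; the output file name is only a placeholder): instead of rebuilding the URL by string concatenation, the next_token is passed back as a query parameter, and the loop stops when the response no longer contains one.
import json
import time
import requests

def fetch_page(url, params):
    """Fetch a single page of results and return the parsed JSON."""
    response = requests.get(url, auth=bearer_oauth, params=params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def main():
    next_token = None
    while True:
        if next_token:
            query_params["next_token"] = next_token
        page = fetch_page(search_url, query_params)
        # append one JSON object per page; "my_tweets.json" is a placeholder path
        with open("my_tweets.json", "a") as json_f:
            json.dump(page, json_f)
            json_f.write("\n")
        next_token = page.get("meta", {}).get("next_token")
        if not next_token:
            print("No more tweets found!")
            break
        time.sleep(1)  # pause briefly between pages to stay within the rate limit

if __name__ == "__main__":
    main()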
Related
Consider that I am writing tests for an API. Is it enough to just check that the status code equals the expected code (200, 201, etc.) and print the content, like in this example:
import requests
import pytest

def test_one():
    res = requests.get("someurl")
    assert res.status_code == 200
    print(res.json())

def test_two():
    data = {"name": "string"}
    res = requests.put("someurl", json=data)
    assert res.status_code == 200
    print(res.json())
Or do I also need to check whether the contents are the same as expected, like in this example:
def test_one():
data = {"name":"string"}
res = requests.put("someurl",json=data)
assert res.status_code == 200
assert data == res.json()
There are a lot of things to validate, the most common being the response JSON:
import json
response_json = json.loads(response.text)
From here you can validate the values in dict format.
Sometimes you also need to validate the response against a JSON schema.
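If schema validation is what you are after, the jsonschema package is a common choice; a minimal sketch (the URL and the schema here are just placeholders) could look like this:
import requests
from jsonschema import validate  # pip install jsonschema

def test_schema():
    res = requests.get("someurl")
    assert res.status_code == 200
    # Describe the shape you expect; this schema is only an example.
    expected_schema = {
        "type": "object",
        "properties": {"name": {"type": "string"}},
        "required": ["name"],
    }
    # Raises jsonschema.exceptions.ValidationError if the response does not match.
    validate(instance=res.json(), schema=expected_schema)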
I have a data set of tweets retrieved via the Twitter streaming API.
I regularly want to be updated on how their public metrics change, so I wrote some code to request those public metrics:
import json
import requests
import pandas as pd

def create_url():
    tweet_fields = "tweet.fields=public_metrics"
tweets_data_path = 'dataset.txt'
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
try:
tweet = json.loads(line)
tweets_data.append(tweet)
except:
continue
df = pd.DataFrame.from_dict(pd.json_normalize(tweets_data), orient='columns')
df_id = (str(str((df['id'].tolist()))[1:-1])).replace(" ", "")
ids = "ids=" + df_id
url = "https://api.twitter.com/2/tweets?{}&{}".format(ids, tweet_fields)
return url
def bearer_oauth(r):
r.headers["Authorization"] = f"Bearer {'AAAAAAAAAAAAAAAAAAAAAN%2B7QwEAAAAAEG%2BzRZkmZ4HGizsKCG3MkwlaRzY%3DOwuZeaeHbeMM1JDIafd5riA1QdkDabPiELFsguR4Zba9ywzzOQ'}"
r.headers["User-Agent"] = "v2TweetLookupPython"
return r
def connect_to_endpoint(url):
response = requests.request("GET", url, auth=bearer_oauth)
print(response.status_code)
if response.status_code != 200:
raise Exception(
"Request returned an error: {} {}".format(
response.status_code, response.text
)
)
return response.json()
def main():
url = create_url()
json_response = connect_to_endpoint(url)
print(json.dumps(json_response, indent=3, sort_keys=True))
if __name__ == "__main__":
main()
Unfortunately, my data set has more than 100 IDs in it and I want to retrieve the metrics for all of them. As I can only request 100 IDs at a time, can you help me with how to do that?
Also, I would like to make the request daily at midnight and then store the results in a txt file. Maybe you can also help me with that?
You can chunk your data and send it in batches using itertools.islice.
test.py:
import reprlib
from itertools import islice
import pandas as pd
BASE_URL = "https://api.twitter.com/2/tweets"
CHUNK = 100
def req(ids):
tmp = reprlib.repr(ids) # Used here just to shorten the output
print(f"{BASE_URL}?ids={tmp}")
def main():
df = pd.DataFrame({"id": range(1000)})
it = iter(df["id"])
while chunk := tuple(islice(it, CHUNK)):
ids = ",".join(map(str, chunk))
req(ids)
if __name__ == "__main__":
main()
Test:
$ python test.py
https://api.twitter.com/2/tweets?ids='0,1,2,3,4,5,...5,96,97,98,99'
https://api.twitter.com/2/tweets?ids='100,101,102,...6,197,198,199'
https://api.twitter.com/2/tweets?ids='200,201,202,...6,297,298,299'
https://api.twitter.com/2/tweets?ids='300,301,302,...6,397,398,399'
https://api.twitter.com/2/tweets?ids='400,401,402,...6,497,498,499'
https://api.twitter.com/2/tweets?ids='500,501,502,...6,597,598,599'
https://api.twitter.com/2/tweets?ids='600,601,602,...6,697,698,699'
https://api.twitter.com/2/tweets?ids='700,701,702,...6,797,798,799'
https://api.twitter.com/2/tweets?ids='800,801,802,...6,897,898,899'
https://api.twitter.com/2/tweets?ids='900,901,902,...6,997,998,999'
Note: You'll make multiple requests with this approach so keep in mind any rate limits.
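The daily-at-midnight part of the question is not covered above. A minimal stdlib-only sketch (fetch_metrics and the file name are placeholders for however you wrap the chunked requests) could sleep until the next midnight and append each day's results as one line of JSON:
import json
import time
from datetime import datetime, timedelta

def seconds_until_midnight():
    """Number of seconds from now until the next midnight."""
    now = datetime.now()
    next_midnight = (now + timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)
    return (next_midnight - now).total_seconds()

def run_daily(fetch_metrics):
    """Call fetch_metrics() once a day at midnight and append the result to a text file."""
    while True:
        time.sleep(seconds_until_midnight())
        results = fetch_metrics()  # e.g. loop over the 100-id chunks and collect the responses
        with open("metrics_log.txt", "a") as f:  # placeholder file name
            f.write(json.dumps({"timestamp": datetime.now().isoformat(), "data": results}) + "\n")
In practice, a cron job or Windows Task Scheduler entry that simply runs the script once a day tends to be more robust than a long-running sleep loop.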
I need to make a request to an API that responds with a maximum of 200 results. If the total amount of data is more than 200, the API also responds with a parameter lastKey that I need to pass in a new request. When all the data has been returned, the lastKey param is not returned anymore.
My question is how to do this in a simple, clean way. This is how I make the first request, and I can see whether the lastKey param is present or not:
url = 'https://example.com'
moreData = False
with requests.Session() as api:
data = requests.get(url)
try:
data.raise_for_status()
except HTTPError as e:
return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)
result = data.json()
if 'lastKey' in result:
url = 'https://example.com&lastKey=' + result['lastKey']
moreData = True
How could I do this whole thing, for example, inside a while loop?
Just make the first request outside the while loop, then keep calling your API while 'lastKey' is in the result:
url = 'https://example.com'
with requests.Session() as api:
data = requests.get(url)
try:
data.raise_for_status()
except HTTPError as e:
return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)
result = data.json()
while 'lastKey' in result:
url = 'https://example.com&lastKey=' + result['lastKey']
with requests.Session() as api:
data = requests.get(url)
try:
data.raise_for_status()
except HTTPError as e:
return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)
result = data.json()
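A slightly tighter variant (just a sketch: it assumes lastKey can be sent as an ordinary query parameter rather than concatenated into the URL, and it reuses one session for all requests) avoids repeating the request code:
import requests

def fetch_all_pages(base_url):
    """Follow lastKey pagination until the API stops returning it."""
    results = []
    params = {}
    with requests.Session() as api:
        while True:
            response = api.get(base_url, params=params)
            response.raise_for_status()  # let the caller turn HTTPError into a 500 response
            result = response.json()
            results.append(result)
            if 'lastKey' not in result:
                break
            params['lastKey'] = result['lastKey']
    return results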
I'm requesting Microsoft's Graph API, using the following function to request multiple pages. I'm trying to request all pages, merge the JSON results, and finally write them to a pandas DataFrame.
v = "v1.0"
r = "/users?$filter=userType eq 'Member'&$select=displayName,givenName,jobTitle,mail,department&$top=200"
def query(v, r):
all_records = []
url = uri.format(v=v, r=r)
while True:
if not url:
break
result = requests.get(url, headers=headers)
if result.status_code == 200:
json_data = json.loads(result.text)
all_records = all_records + json_data["value"]
url = json_data["#odata.nextLink"]
return all_records
The while-loop goes through all the pages, but when I run the function I'm getting a error:
KeyError: '@odata.nextLink'
I assume this is because the loop reaches the final page, and thus the '@odata.nextLink' cannot be found. But how can I handle this?
You are doing
url = json_data["#odata.nextLink"]
which suggests json_data is a dict, so you should be able to use the .get method, which returns a default value when the key is not found (None by default). Please try the following and let me know whether it works as expected:
url = json_data.get("@odata.nextLink")
if url is None:
print("nextLink not found")
else:
print("nextLink found")
I encounter an index out of range error when I try to get the number of contributors of a GitHub project in a loop. After some iterations (which work perfectly) it just throws that exception, and I have no clue why ...
import requests
from lxml import html

for x in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
    contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
    print(contributors_number)  # prints the correct number until the exception
Here's the exception.
----> 4 contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
IndexError: list index out of range
It seems likely that you're getting a 429 - Too Many Requests, since you're firing off requests one after the other.
You might want to modify your code as such:
import time

for index in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
    contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
    print(contributors_number)
    time.sleep(3)  # Wait a bit before firing off another request
Better yet would be:
import time

for index in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    if r.status_code == 200:  # Check if the request was successful
        xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
        contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
        print(contributors_number)
    else:
        print("Failed fetching page, status code: " + str(r.status_code))
    time.sleep(3)  # Wait a bit before firing off another request
Now this works perfectly for me while using the API. Probably the cleanest way of doing it.
import requests
import json
url = 'https://api.github.com/repos/valentinxxx/nginxconfig.io/commits?&per_page=100'
response = requests.get(url)
commits = json.loads(response.text)
commits_total = len(commits)
page_number = 1
while len(commits) == 100:
page_number += 1
url = 'https://api.github.com/repos/valentinxxx/nginxconfig.io/commits?&per_page=100'+'&page='+str(page_number)
response = requests.get(url)
commits = json.loads(response.text)
commits_total += len(commits)
GitHub is blocking your repeated requests. Do not scrape sites in quick succession; many website operators actively block clients that make too many requests. As a result, the content that is returned no longer matches your XPath query.
You should be using the REST API that GitHub provides to retrieve project stats like the number of contributors, and you should implement some kind of rate limiting. There is no need to retrieve the same number 100 times; contributor counts do not change that rapidly.
API responses include information on how many requests you can make in a time window, and you can use conditional requests to only incur rate limit costs when the data actually has changed:
import requests
import time
from urllib.parse import parse_qsl, urlparse
owner, repo = 'tipsy', 'profile-summary-for-github'
github_username = '....'
# token = '....' # optional Github basic auth token
stats = 'https://api.github.com/repos/{}/{}/contributors'
with requests.session() as sess:
# GitHub requests you use your username or appname in the header
sess.headers['User-Agent'] += ' - {}'.format(github_username)
# Consider logging in! You'll get more quota
# sess.auth = (github_username, token)
# start with the first, move to the last when available, include anonymous
last_page = stats.format(owner, repo) + '?per_page=100&page=1&anon=true'
while True:
r = sess.get(last_page)
if r.status_code == requests.codes.not_found:
print("No such repo")
break
if r.status_code == requests.codes.no_content:
print("No contributors, repository is empty")
break
if r.status_code == requests.codes.accepted:
print("Stats not yet ready, retrying")
elif r.status_code == requests.codes.not_modified:
print("Stats not changed")
elif r.ok:
# success! Check for a last page, get that instead of current
# to get accurate count
link_last = r.links.get('last', {}).get('url')
if link_last and r.url != link_last:
last_page = link_last
else:
# this is the last page, report on count
params = dict(parse_qsl(urlparse(r.url).query))
page_num = int(params.get('page', '1'))
per_page = int(params.get('per_page', '100'))
contributor_count = len(r.json()) + (per_page * (page_num - 1))
print("Contributor count:", contributor_count)
# only get us a fresh response next time
sess.headers['If-None-Match'] = r.headers['ETag']
# pace ourselves following the rate limit
window_remaining = int(r.headers['X-RateLimit-Reset']) - time.time()
rate_remaining = int(r.headers['X-RateLimit-Remaining'])
# sleep long enough to honour the rate limit or at least 100 milliseconds
time.sleep(max(window_remaining / rate_remaining, 0.1))
The above uses a requests session object to handle repeated headers and ensure that you get to reuse connections where possible.
A good library such as github3.py (incidentally written by a requests core contributor) will take care of most of those details for you.
If you do want to persist in scraping the site directly, you take the risk that the site operators block you altogether. Try to take some responsibility by not hammering the site continually.
That means that at the very least, you should honour the Retry-After header that GitHub gives you on 429:
if not r.ok:
print("Received a response other that 200 OK:", r.status_code, r.reason)
retry_after = r.headers.get('Retry-After')
if retry_after is not None:
print("Response included a Retry-After:", retry_after)
time.sleep(int(retry_after))
else:
# parse OK response