How to check if a file is already in the cache? - python

I am working on a tool that posts in-game news updates from games to your Twitter account. For the last few days I have been looking for a way to check whether an image is already in the cache, so that when a game's news feed updates, only the new entries get posted and the old ones are skipped (at the moment it posts every active news entry again). I have tested it about 100 times but I can't get it to work. I really hope one of you can help me with this issue, because it would be fantastic to have the tool working with a method like this. Thanks in advance for any help.
Here is my code:
import tweepy
import time
import os
import requests
from colorama import *

init()
auth = tweepy.OAuthHandler('API', 'APISECRET')
auth.set_access_token('ACESS', 'ACESSSECRET')
response = requests.get('https://fortnite-api.com/v2/news/br')
newsData = response.json()["data"]
#-----
footer = '#Fortnite'
delay = 5
saveImages = True
#-----
while 1:
    response = requests.get('https://fortnite-api.com/v2/news/br')
    if response:
        newsDataLoop = response.json()["data"]
        print("2 - Checking for change in news feed...")
        if newsData != newsDataLoop:
            #if loop == True:
            print("News Feed has changed...")
            for i in newsDataLoop["motds"]:
                try:
                    print("Saving: " + i["id"])
                    url = i["image"]
                    r = requests.get(url, allow_redirects=True)
                    open("NewsImages/" + i["id"] + '.png', 'wb').write(r.content)
                    print("Saved: " + i["id"])
                    try:
                        api = tweepy.API(auth)
                        api.update_with_media("NewsImages/" + i["id"] + '.png', "Fortnite News Update:\n\n" + i["title"] + ":\n" + i["body"] + "\n\n" + footer)
                        print("Tweeted: " + i["id"])
                    except:
                        print("Failed to tweet: " + i["id"])
                    if saveImages == 'False':
                        os.remove("NewsImages/" + i["id"] + '.png')
                    response = requests.get('https://fortnite-api.com/v2/news/br')
                    newsData = response.json()["data"]
                except:
                    print("Error in tweeting news feed: skipping")
            print("Finished news feed publishing")
    else:
        print("FAILED TO GRAB NEWS DATA: URL DOWN")
    time.sleep(delay)

You need to check each motd to see if it existed in the old dataset.
import tweepy
import time
import os
import requests
from colorama import *

init()
auth = tweepy.OAuthHandler('API', 'APISECRET')
auth.set_access_token('ACESS', 'ACESSSECRET')
response = requests.get('https://fortnite-api.com/v2/news/br')
newsData = response.json()["data"]
#-----
footer = '#Fortnite'
delay = 5
saveImages = True
#-----
while 1:
    response = requests.get('https://fortnite-api.com/v2/news/br')
    if response:
        newsDataLoop = response.json()["data"]
        print("2 - Checking for change in news feed...")
        if newsData != newsDataLoop:
            #if loop == True:
            print("News Feed has changed...")
            for i in newsDataLoop["motds"]:
                if i in newsData["motds"]:
                    # has already been posted
                    print("Already posted")
                    continue
                try:
                    print("Saving: " + i["id"])
                    url = i["image"]
                    r = requests.get(url, allow_redirects=True)
                    open("NewsImages/" + i["id"] + '.png', 'wb').write(r.content)
                    print("Saved: " + i["id"])
                    try:
                        api = tweepy.API(auth)
                        api.update_with_media("NewsImages/" + i["id"] + '.png', "Fortnite News Update:\n\n" + i["title"] + ":\n" + i["body"] + "\n\n" + footer)
                        print("Tweeted: " + i["id"])
                    except:
                        print("Failed to tweet: " + i["id"])
                    if saveImages == 'False':
                        os.remove("NewsImages/" + i["id"] + '.png')
                    response = requests.get('https://fortnite-api.com/v2/news/br')
                    newsData = response.json()["data"]
                except:
                    print("Error in tweeting news feed: skipping")
            print("Finished news feed publishing")
    else:
        print("FAILED TO GRAB NEWS DATA: URL DOWN")
    time.sleep(delay)
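If you prefer to key the check off the cached image files themselves, as the title suggests, a minimal sketch (assuming the same NewsImages/ folder and the id-based file names used above) would skip any motd whose image is already saved on disk:

import os

def already_cached(motd_id, cache_dir="NewsImages"):
    # A previously saved image is treated as proof that this motd was already posted.
    return os.path.isfile(os.path.join(cache_dir, motd_id + ".png"))

Inside the loop over newsDataLoop["motds"], the check would then be:

if already_cached(i["id"]):
    print("Already posted")
    continue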

Related

JSONDecodeError: Extra data: line 1 column 8 (char 7)

I've followed a tutorial to scrape from a Facebook profile and I keep getting this error:
JSONDecodeError: Extra data: line 1 column 8 (char 7)
Does anyone know what the problem might be?
Here is my python script:
import json
import logging
import time
from collections import OrderedDict

import requests
from bs4 import BeautifulSoup

def get_bs(session, url):
    # Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')

# To login
def make_login(session, base_url, credentials):
    # Returns a Session object logged in with credentials.
    login_form_url = '/login/device-based/regular/login/?refsrc=https%3A'\
                     '%2F%2Fmobile.facebook.com%2Flogin%2Fdevice-based%2Fedit-user%2F&lwv=100'
    params = {'email': credentials['email'], 'pass': credentials['pass']}
    while True:
        time.sleep(3)
        logged_request = session.post(base_url + login_form_url, data=params)
        if logged_request.ok:
            logging.info('[*] Logged in.')
            break

# Crawling FB
def crawl_profile(session, base_url, profile_url, post_limit):
    # Goes to profile URL, crawls it and extracts posts URLs.
    profile_bs = get_bs(session, profile_url)
    n_scraped_posts = 0
    scraped_posts = list()
    posts_id = None
    while n_scraped_posts < post_limit:
        try:
            posts_id = 'recent'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        except Exception:
            posts_id = 'structured_composer_async_container'
            posts = profile_bs.find('div', id=posts_id).div.div.contents
        posts_urls = [a['href'] for a in profile_bs.find_all('a', text='Full Story')]
        for post_url in posts_urls:
            # print(post_url)
            try:
                post_data = scrape_post(session, base_url, post_url)
                scraped_posts.append(post_data)
            except Exception as e:
                logging.info('Error: {}'.format(e))
            n_scraped_posts += 1
            if posts_completed(scraped_posts, post_limit):
                break
        show_more_posts_url = None
        if not posts_completed(scraped_posts, post_limit):
            show_more_posts_url = profile_bs.find('div', id=posts_id).next_sibling.a['href']
            profile_bs = get_bs(session, base_url + show_more_posts_url)
            time.sleep(3)
        else:
            break
    return scraped_posts

def get_bs(session, url):
    # Makes a GET request using the given Session object and returns a BeautifulSoup object.
    r = None
    while True:
        r = session.get(url)
        time.sleep(3)
        if r.ok:
            break
    return BeautifulSoup(r.text, 'lxml')

# Scraping FB
def scrape_post(session, base_url, post_url):
    # Goes to post URL and extracts post data.
    post_data = OrderedDict()
    post_bs = get_bs(session, base_url + post_url)
    time.sleep(5)
    # Here we populate the OrderedDict object
    post_data['url'] = post_url
    # Find post main element
    try:
        post_text_element = post_bs.find('div', id='u_0_0').div
        string_groups = [p.strings for p in post_text_element.find_all('p')]
        strings = [repr(string) for group in string_groups for string in group]
        post_data['text'] = strings
    except Exception:
        post_data['text'] = []
    # Extract post media URL
    try:
        post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
    except Exception:
        post_data['media_url'] = ''
    # Extract remaining data
    try:
        post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
    except Exception:
        post_data['comments'] = []
    return dict(post_data)
# Function for profile URL and credentials for FB
def json_to_obj(filename):
    # Extracts data from a JSON file and loads it into a Python object.
    obj = None
    with open(filename) as json_file:
        obj = json.loads(json_file.read())
    return obj

def save_data(data):
    # Converts data to JSON.
    with open('profile_posts_data.json', 'w') as json_file:
        json.dump(data, json_file, indent=4)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    base_url = 'https://mobile.facebook.com'
    session = requests.session()
    # Extracts credentials for the login and all of the profile URLs to scrape
    credentials = json_to_obj('credentials.json')
    profiles_urls = json_to_obj('profiles_urls.json')
    make_login(session, base_url, credentials)
    posts_data = None
    for profile_url in profiles_urls:
        posts_data = crawl_profile(session, base_url, profile_url, 25)
    logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
    logging.info('[!] Saving.')
    save_data(posts_data)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    base_url = 'https://mobile.facebook.com'
    session = requests.session()
    # Extracts credentials for the login and all of the profile URLs to scrape
    credentials = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\credentials.json")
    profiles_urls = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\profiles_urls.json")
    make_login(session, base_url, credentials)
    posts_data = None
    for profile_url in profiles_urls:
        posts_data = crawl_profile(session, base_url, profile_url, 25)
    logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
    logging.info('[!] Saving.')
    save_data(posts_data)
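The "Extra data" message means json.loads was handed text that is not exactly one JSON document, for example a file with several JSON objects concatenated or stray characters after the closing brace. A minimal diagnostic sketch (assuming the credentials.json path used above) is to look at the raw text before decoding it:

import json

with open(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\credentials.json") as json_file:
    raw = json_file.read()

print(repr(raw[:50]))   # shows what actually sits around line 1, column 8
data = json.loads(raw)  # raises JSONDecodeError: Extra data if anything follows the first document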

Setting an expiration on a function for a retry

I am harvesting captchas, and captcha responses are only good for 2 minutes. I am trying to figure out how to set up an "Expired, fetching again" retry.
import time

import requests
from colorama import Fore

def captchaToken():
    global captoken
    s = requests.Session()
    # here we post the site key to 2captcha to get the captcha ID (and we parse it here too)
    captcha_id = s.post("http://2captcha.com/in.php?key={}&method=userrecaptcha&googlekey={}&pageurl={}".format(APItoken, sitekey, url)).text.split('|')[1]
    # time.sleep(5)
    print("Waiting for captcha...")
    # then we parse the g-response from the 2captcha response
    recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(APItoken, captcha_id)).text
    print("Solving captcha...")
    while 'CAPCHA_NOT_READY' in recaptcha_answer:
        print("Waiting for a response . . .")
        time.sleep(2.5)
        recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(APItoken, captcha_id)).text
    recaptcha_answer = recaptcha_answer.split('|')[1]
    captoken = recaptcha_answer
    print(Fore.MAGENTA + "Captcha Response: " + recaptcha_answer)
I'd suggest:
while 1:
    captcha = get_captcha()
    request = post_captcha(captcha)
    start = time.time()
    while 1:
        response = poll_for_result(request)
        if start + 120 < time.time():
            time_is_up()
            break
        if response:
            return deal_with_response(response)
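Applied to the 2captcha polling loop above, a minimal sketch (assuming the same APItoken, sitekey and url values; the function name is only illustrative) would restart the whole request once two minutes pass without an answer:

import time
import requests

def get_captcha_token(APItoken, sitekey, url, timeout=120):
    s = requests.Session()
    while True:
        # submit the sitekey and remember when this attempt started
        captcha_id = s.post("http://2captcha.com/in.php?key={}&method=userrecaptcha&googlekey={}&pageurl={}"
                            .format(APItoken, sitekey, url)).text.split('|')[1]
        start = time.time()
        while time.time() - start < timeout:
            answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}"
                           .format(APItoken, captcha_id)).text
            if 'CAPCHA_NOT_READY' not in answer:
                return answer.split('|')[1]
            time.sleep(2.5)
        print("Expired, fetching again")  # two minutes elapsed, request a fresh captcha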

python request urls parallel [duplicate]

This question already has an answer here: How to send multiple http requests python (1 answer). Closed 6 years ago.
I created the following script to download images from an API endpoint, and it works as intended. The problem is that it is rather slow, because all the requests have to wait on each other. What is the correct way to keep the steps sequential for each item I want to fetch, but run the individual items in parallel? The data comes from an online service called servicem8.
So what I hope to achieve is:
fetch all possible job ids => keep the name and other info
fetch name of the customer
fetch each attachment of a job
These three steps should be done for each job, so I could make things parallel per job, as the jobs do not have to wait on each other.
Update:
What I do not understand is how to bundle, for example, the three calls per item into one unit of work, since it is only across items that things can run in parallel. So when I want to
fetch item (fetch name => fetch description => fetch id)
it is this per-item fetch that I want to run in parallel.
The current code I have is working but rather slow:
import requests
import dateutil.parser
import shutil
import os

user = "test#test.com"
passw = "test"

print("Read json")
url = "https://api.servicem8.com/api_1.0/job.json"
r = requests.get(url, auth=(user, passw))
print("finished reading jobs.json file")

scheduled_jobs = []
if r.status_code == 200:
    for item in r.json():
        scheduled_date = item['job_is_scheduled_until_stamp']
        try:
            parsed_date = dateutil.parser.parse(scheduled_date)
            if parsed_date.year == 2016:
                if parsed_date.month == 10:
                    if parsed_date.day == 10:
                        url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
                        c = requests.get(url_customer, auth=(user, passw))
                        cus_name = c.json()['name']
                        scheduled_jobs.append([item['uuid'], item['generated_job_id'], cus_name])
        except ValueError:
            pass

for job in scheduled_jobs:
    print("fetch for job {}".format(job))
    url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
    r = requests.get(url, auth=(user, passw))
    if r.json() == []:
        pass
    for attachment in r.json():
        if attachment['active'] == 1 and attachment['file_type'] != '.pdf':
            print("fetch for attachment {}".format(attachment))
            url_staff = "https://api.servicem8.com/api_1.0/Staff.json?%24filter=uuid%20eq%20{}".format(attachment['created_by_staff_uuid'])
            s = requests.get(url_staff, auth=(user, passw))
            for staff in s.json():
                tech = "{}_{}".format(staff['first'], staff['last'])
            url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment['uuid'])
            r = requests.get(url, auth=(user, passw), stream=True)
            if r.status_code == 200:
                creation_date = dateutil.parser.parse(attachment['timestamp']).strftime("%d.%m.%y")
                if not os.path.exists(os.getcwd() + "/{}/{}".format(job[2], job[1])):
                    os.makedirs(os.getcwd() + "/{}/{}".format(job[2], job[1]))
                path = os.getcwd() + "/{}/{}/SC -O {} {}{}".format(job[2], job[1], creation_date, tech.upper(), attachment['file_type'])
                print("writing file to path {}".format(path))
                with open(path, 'wb') as f:
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
            else:
                print(r.text)
Update [14/10]
I updated the code in the following way with some of the hints given; thanks a lot for that. The only thing left to optimise, I guess, is the attachment downloading, but it is working fine now. A funny thing I learned is that you cannot create a CON folder on a Windows machine :-) I did not know that.
I use pandas as well, just to avoid some loops over my list of dicts, but I am not sure this is the most performant approach yet. The longest part is actually reading in the full JSON files. I read them in completely because I could not find a way of telling the API to return only the jobs from September 2016; the API query filter seems to support only eq/lt/gt.
import requests
import dateutil.parser
import shutil
import os
import pandas as pd

user = ""
passw = ""
FOLDER = os.getcwd()
headers = {"Accept-Encoding": "gzip, deflate"}

import grequests

urls = [
    'https://api.servicem8.com/api_1.0/job.json',
    'https://api.servicem8.com/api_1.0/Attachment.json',
    'https://api.servicem8.com/api_1.0/Staff.json',
    'https://api.servicem8.com/api_1.0/Company.json'
]
# Create a set of unsent Requests:
print("Read json files")
rs = (grequests.get(u, auth=(user, passw), headers=headers) for u in urls)
# Send them all at the same time:
jobs, attachments, staffs, companies = grequests.map(rs)

# create dataframes
df_jobs = pd.DataFrame(jobs.json())
df_attachments = pd.DataFrame(attachments.json())
df_staffs = pd.DataFrame(staffs.json())
df_companies = pd.DataFrame(companies.json())

#url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
#c = requests.get(url_customer, auth=(user, passw))
#url = "https://api.servicem8.com/api_1.0/job.json"
#jobs = requests.get(url, auth=(user, passw), headers=headers)
#print("Reading attachments json")
#url = "https://api.servicem8.com/api_1.0/Attachment.json"
#attachments = requests.get(url, auth=(user, passw), headers=headers)
#print("Reading staff.json")
#url_staff = "https://api.servicem8.com/api_1.0/Staff.json"
#staffs = requests.get(url_staff, auth=(user, passw))

scheduled_jobs = []
if jobs.status_code == 200:
    print("finished reading json file")
    for job in jobs.json():
        scheduled_date = job['job_is_scheduled_until_stamp']
        try:
            parsed_date = dateutil.parser.parse(scheduled_date)
            if parsed_date.year == 2016:
                if parsed_date.month == 9:
                    cus_name = df_companies[df_companies.uuid == job['company_uuid']].iloc[0]['name'].upper()
                    cus_name = cus_name.replace('/', '')
                    scheduled_jobs.append([job['uuid'], job['generated_job_id'], cus_name])
        except ValueError:
            pass
print("{} jobs to fetch".format(len(scheduled_jobs)))

for job in scheduled_jobs:
    print("fetch for job attachments {}".format(job))
    #url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
    if attachments == []:
        pass
    for attachment in attachments.json():
        if attachment['related_object_uuid'] == job[0]:
            if attachment['active'] == 1 and attachment['file_type'] != '.pdf' and attachment['attachment_source'] != 'INVOICE_SIGNOFF':
                for staff in staffs.json():
                    if staff['uuid'] == attachment['created_by_staff_uuid']:
                        tech = "{}_{}".format(staff['first'].split()[-1].strip(), staff['last'])
                creation_timestamp = dateutil.parser.parse(attachment['timestamp'])
                creation_date = creation_timestamp.strftime("%d.%m.%y")
                creation_time = creation_timestamp.strftime("%H_%M_%S")
                path = FOLDER + "/{}/{}/SC_-O_D{}_T{}_{}{}".format(job[2], job[1], creation_date, creation_time, tech.upper(), attachment['file_type'])
                # fetch attachment
                if not os.path.isfile(path):
                    url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment['uuid'])
                    r = requests.get(url, auth=(user, passw), stream=True)
                    if r.status_code == 200:
                        if not os.path.exists(FOLDER + "/{}/{}".format(job[2], job[1])):
                            os.makedirs(FOLDER + "/{}/{}".format(job[2], job[1]))
                        print("writing file to path {}".format(path))
                        with open(path, 'wb') as f:
                            r.raw.decode_content = True
                            shutil.copyfileobj(r.raw, f)
                    else:
                        print(r.text)
                else:
                    print("file already exists")
The general idea is to use asynchronous URL requests, and there is a Python module named grequests for that: https://github.com/kennethreitz/grequests
From the documentation:
import grequests

urls = [
    'http://www.heroku.com',
    'http://python-tablib.org',
    'http://httpbin.org',
    'http://python-requests.org',
    'http://fakedomain/',
    'http://kennethreitz.com'
]
# Create a set of unsent Requests:
rs = (grequests.get(u) for u in urls)
# Send them all at the same time:
grequests.map(rs)
And the response:
[<Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, None, <Response [200]>]
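To parallelise per job rather than per URL, one option (a sketch only, not part of the answer above) is to wrap the dependent calls for one job in a single function and run that function for many jobs concurrently with a thread pool; fetch_job below is a hypothetical stand-in for the sequential per-job logic in the question:

import requests
from concurrent.futures import ThreadPoolExecutor

def fetch_job(job, user, passw):
    # The dependent steps for one job stay sequential inside this function.
    uuid, job_id, cus_name = job
    url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(uuid)
    attachments = requests.get(url, auth=(user, passw)).json()
    # ... fetch staff details and download the files here, exactly as in the loop above ...
    return job_id, len(attachments)

# Different jobs do not depend on each other, so they can run concurrently.
with ThreadPoolExecutor(max_workers=10) as pool:
    results = list(pool.map(lambda job: fetch_job(job, user, passw), scheduled_jobs))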

Pythonic way of using next-page token

A lot of requests now return page tokens, and I was curious what the most appropriate way of dealing with them is.
I have usually just gone with recursion:
def get_followers(user_id,
                  access_token,
                  cursor=''):
    url = 'https://api.instagram.com/v1/users/%s/followed-by?access_token=%s&cursor=%s' % (
        my_user, access_token, cursor)
    print(url)
    response = requests.get(url)
    json_out = json.loads(response.text)
    for user in json_out['data']:
        usrname = user['username']
        my_followers.add_follower(usrname)
        print("Added: %s" % usrname)
    try:
        cursor = json_out['pagination']['next_url']
        get_followers(user_id, access_token, cursor)
    except KeyError:
        print("Finished with %d followers" % len(my_followers.followers))
        return
However, perhaps a

while True:
    ...
    if condition:
        break

or some other implementation is seen as more efficient/pythonic?
You could change this to a generator function that yields the next follower when the structure is accessed, fetching more data only when necessary.
def get_followers(...):
    ...
    while token is not None:
        # fetch data
        ...
        for user in json_out['data']:
            yield user
        ...
        # fetch new token
Then, iterate through this generator to apply your data handling. This also has the advantage of separating data acquisition and handling logic.
followers = get_followers(...)
for user in followers:
    username = user['username']
    my_followers.add_follower(username)
    print("Added: %s" % username)
Thanks for the help; I have never used generators apart from the implicit range vs xrange distinction. I was curious whether in this case it provides any speed/memory advantage?
My final code-snippet:
def get_followers(user_id,
                  access_token,
                  cursor=''):
    """Create a generator which will be accessed later to get the usernames of the followers."""
    while cursor is not None:
        # fetch data
        url = 'https://api.instagram.com/v1/users/%s/followed-by?access_token=%s&cursor=%s' % (
            my_user, access_token, cursor)
        print(url)
        response = requests.get(url)
        json_out = json.loads(response.text)
        for user in json_out['data']:
            yield user
        # fetch new token
        try:
            cursor = json_out['pagination']['next_url']
        except KeyError:
            print("Finished with %d followers" % len(my_followers.followers))
            cursor = None

followers_gen = get_followers(my_user, access_token)  # Get followers
for usr in followers_gen:
    usrname = usr['username']
    my_followers.add_follower(usrname)
    print("Added: %s" % usrname)
Compared to not using a generator:
def get_followers(user_id,
                  access_token,
                  cursor=''):
    """Fetch and store the usernames of the followers without a generator."""
    while cursor is not None:
        # fetch data
        url = 'https://api.instagram.com/v1/users/%s/followed-by?access_token=%s&cursor=%s' % (
            my_user, access_token, cursor)
        print(url)
        response = requests.get(url)
        json_out = json.loads(response.text)
        for usr in json_out['data']:
            usrname = usr['username']
            my_followers.add_follower(usrname)
            print("Added: %s" % usrname)
        # fetch new token
        try:
            cursor = json_out['pagination']['next_url']
        except KeyError:
            print("Finished with %d followers" % len(my_followers.followers))
            cursor = None
get_followers(my_user, access_token) # Get followers
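One practical advantage of the generator version is lazy pagination: pages are requested only while the consumer keeps iterating, so only one page of JSON is held at a time and iteration can stop early without fetching the remaining pages. A minimal sketch reusing the generator above:

from itertools import islice

followers_gen = get_followers(my_user, access_token)
# Take only the first 100 followers; no further pages are requested after that.
first_hundred = [usr['username'] for usr in islice(followers_gen, 100)]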

Python script to harvest tweets to a MongoDb works with users but not hashtags. Any ideas why not?

I'm playing around with the Twitter API and am in the process of developing a script to pull all tweets with a certain hashtag down to a local MongoDB. It works fine when I'm downloading tweets from users, but when downloading tweets for a hashtag I get:
return loads(fp.read(),
AttributeError: 'int' object has no attribute 'read'
Can anyone offer their infinite wisdom into how I could get this script to work?
To run, save it as a .py file, cd to the folder and run:
python twitter.py
Code:
__author__ = 'Tom Cusack'

import pymongo
import oauth2 as oauth
import urllib2, json
import sys, argparse, time

def oauth_header(url, consumer, token):
    params = {'oauth_version': '1.0',
              'oauth_nonce': oauth.generate_nonce(),
              'oauth_timestamp': int(time.time()),
              }
    req = oauth.Request(method='GET', url=url, parameters=params)
    req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, token)
    return req.to_header()['Authorization'].encode('utf-8')

def main():
    ### Twitter Settings
    numtweets = '32000'
    verbose = 'store_true'
    retweet = 'store_false'
    CONSUMER_KEY = 'M7Xu9Wte0eIZvqhb4G9HnIn3G'
    CONSUMER_SECRET = 'c8hB4Qwps2aODQUx7UsyzQuCRifEp3PKu6hPQll8wnJGIhbKgZ'
    ACCESS_TOKEN = '3213221313-APuXuNjVMbRbZpu6sVbETbgqkponGsZJVT53QmG'
    ACCESS_SECRET = 'BJHrqWC9ed3pA5oDstSMCYcUcz2pYF3DmJ7jcuDe7yxvi'
    base_url = url = 'https://api.twitter.com/1.1/search/tweets.json?include_entities=true&count=200&q=#mongodb&include_rts=%s' % (retweet)
    oauth_consumer = oauth.Consumer(key=CONSUMER_KEY, secret=CONSUMER_SECRET)
    oauth_token = oauth.Token(key=ACCESS_TOKEN, secret=ACCESS_SECRET)

    ### Mongodb Settings
    uri = 'mongodb://127.0.0.1:27017/SARKY'
    if uri != None:
        try:
            conn = pymongo.MongoClient(uri)
            print 'Pulling Tweets..'
        except:
            print 'Error: Unable to connect to DB. Check uri variable.'
            return
        uri_parts = pymongo.uri_parser.parse_uri(uri)
        db = conn[uri_parts['database']]
        db['twitter-harvest'].ensure_index('id_str')

    ### Helper Variables for Harvest
    max_id = -1
    tweet_count = 0
    stream = 0

    ### Begin Harvesting
    while True:
        auth = oauth_header(url, oauth_consumer, oauth_token)
        headers = {"Authorization": auth}
        request = urllib2.Request(url, headers=headers)
        try:
            stream = urllib2.urlopen(request)
        except urllib2.HTTPError, err:
            if err.code == 404:
                print 'Error: Unknown user. Check --user arg'
                return
            if err.code == 401:
                print 'Error: Unauthorized. Check Twitter credentials'
                return
        tweet_list = json.load(stream)
        if len(tweet_list) == 0:
            print 'No tweets to harvest!'
            return
        if 'errors' in tweet_list:
            print 'Hit rate limit, code: %s, message: %s' % (tweets['errors']['code'], tweets['errors']['message'])
            return
        if max_id == -1:
            tweets = tweet_list
        else:
            tweets = tweet_list[1:]
        if len(tweets) == 0:
            print 'Finished Harvest!'
            return
        for tweet in tweets:
            max_id = id_str = tweet['id_str']
            try:
                if tweet_count == numtweets:
                    print 'Finished Harvest- hit numtweets!'
                    return
                if uri != None:
                    db[user].update({'id_str': id_str}, tweet, upsert=True)
                else:
                    print tweet['text']
                tweet_count += 1
                if verbose == True and uri != None:
                    print tweet['text']
            except Exception, err:
                print 'Unexpected error encountered: %s' % (err)
                return
        url = base_url + '&max_id=' + max_id

if __name__ == '__main__':
    try:
        main()
    except SystemExit as e:
        if e.code == 0:
            pass
You initially set stream = 0. When your try...except block catches an HTTP error whose code isn't 404 or 401, stream is still equal to 0, but your except block doesn't break out of the function, so json.load(stream) is then called on the integer 0, which is what raises AttributeError: 'int' object has no attribute 'read'.
I'd look more closely at what that response actually says.
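A minimal sketch of how the harvesting loop could guard against that, keeping the question's urllib2/Python 2 style, is to report and bail out (or retry) whenever the request fails, so json.load is never reached with stream still set to 0:

try:
    stream = urllib2.urlopen(request)
except urllib2.HTTPError, err:
    if err.code == 404:
        print 'Error: Unknown user. Check --user arg'
    elif err.code == 401:
        print 'Error: Unauthorized. Check Twitter credentials'
    else:
        print 'Error: HTTP %s, body: %s' % (err.code, err.read())
    return  # never fall through to json.load with stream == 0
tweet_list = json.load(stream)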
