Python "while true" loop does NOT end (using Python Tweepy) - python

The following code mostly "works", meaning that it scrapes all the tweets from the Twitter API for a specified day. But the while True loop never seems to break, and I don't see the expected "Finished!!!" string even though the CSV file is complete.
import tweepy
import time
import csv

ckey = "xxx"
csecret = "xxx"
atoken = "xxx-xxx"
asecret = "xxx"

OAUTH_KEYS = {'consumer_key': ckey, 'consumer_secret': csecret,
              'access_token_key': atoken, 'access_token_secret': asecret}
auth = tweepy.OAuthHandler(OAUTH_KEYS['consumer_key'], OAUTH_KEYS['consumer_secret'])
api = tweepy.API(auth)

startSince = '2014-10-03'
endUntil = '2014-10-04'
suffix = '_03OCT2014.csv'
searchTerms = 'xyz'

tweets = tweepy.Cursor(api.search, q=searchTerms,
                       since=startSince, until=endUntil).items()

while True:
    try:
        for tweet in tweets:
            placeHolder = []
            placeHolder.append(tweet.author.name.encode('utf8'))
            placeHolder.append(tweet.author.screen_name.encode('utf8'))
            placeHolder.append(tweet.created_at)
            prefix = 'TweetData_xyz'
            wholeFileName = prefix + suffix
            with open(wholeFileName, "ab") as f:
                writeFile = csv.writer(f)
                writeFile.writerow(placeHolder)
    except tweepy.TweepError:
        time.sleep(60 * 15)
        continue
    except IOError:
        time.sleep(60 * 5)
        continue
    except StopIteration:
        break

print "Finished!!!"

StopIteration never reaches your except handler. The for statement consumes it when the tweepy.Cursor().items() iterator is exhausted; it does not propagate any further.
Just break out when the for loop ends:
while True:
    try:
        for tweet in tweets:
            # do stuff
        # completed iterating successfully
        break
and remove the except StopIteration: handler altogether.
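To see why the except StopIteration: handler can never fire, here is a minimal illustration (plain Python, no Tweepy needed): the for statement itself calls next() on the iterator and absorbs the StopIteration that signals exhaustion, so the surrounding try never sees it.
it = iter([1, 2, 3])
for item in it:
    pass  # the for statement calls next(it) internally and
          # swallows the final StopIteration itself
print "the loop simply ended; no exception escaped"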

Your code has no exit condition.
It seems you don't want to exit the loop when an error is thrown, so I assume that when you reach the end of your while body you would like to exit, yes?
def process_tweet(tweet):
    placeHolder = []
    placeHolder.append(tweet.author.name.encode('utf8'))
    placeHolder.append(tweet.author.screen_name.encode('utf8'))
    placeHolder.append(tweet.created_at)
    prefix = 'TweetData_xyz'
    wholeFileName = prefix + suffix
    with open(wholeFileName, "ab") as f:
        writeFile = csv.writer(f)
        writeFile.writerow(placeHolder)

while True:
    try:
        for tweet in tweets:
            process_tweet(tweet)
        break
    except tweepy.TweepError:
        time.sleep(60 * 15)
        continue
    except IOError:
        time.sleep(60 * 5)
        continue
    except StopIteration:
        break

print "Finished!!!"

Related

Is there a way to index a tweet with Tweepy?

I'm trying to script a Twitter bot that will respond to mentions that contain equations. First, I got the mentions to work (it responds to anyone who mentions it). Then I tried to implement the math function, which uses regex (I had already created this; it was just a matter of integrating it into the main bot program).
The Code for Mentions:
import mathbotcreds as mtc
import logging
import re
import tweepy
from time import sleep as wait

auth = tweepy.OAuthHandler(mtc.CONSUMER_KEY, mtc.CONSUMER_SECRET)
auth.set_access_token(mtc.ACCESS_TOKEN, mtc.ACCESS_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True,
                 retry_count=2)

try:
    api.verify_credentials()
    print("Authentication Successful!")
except:
    print("Error during authentication! :(")

mentions = api.mentions_timeline()
pattern = r'([0-9]+.*[-+*/%].*[0-9]+)+'

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

def check_mentions(api, since_id):
    logger.info("Collecting mentions... ")
    new_since_id = since_id
    for tweet in tweepy.Cursor(api.mentions_timeline, since_id=since_id).items():
        new_since_id = max(tweet.id, new_since_id)
        if tweet.in_reply_to_status_id is not None:
            continue
        api.update_status(
            status=f"Hello! \n\nIt worked! \nYay! ^-^ \n\n (You said: \"{tweet.text}\".)",
            in_reply_to_status_id=tweet.id)
    return new_since_id

def main():
    since_id = 1
    while True:
        since_id = check_mentions(api, since_id)
        logger.info("Waiting... ")
        wait(15)

if __name__ == "__main__":
    logger.info("Running script... ")
    wait(1)
    main()

# for m in mentions:
#     api.update_status(f"#{m.user.screen_name} Hello! \nYou said: \n{m.text}", m.id)
#     wait(15)
The Code for Mentions and Equations Functions:
import mathbotcreds as mtc
import logging
import re
import tweepy
from time import sleep as wait

auth = tweepy.OAuthHandler(mtc.CONSUMER_KEY, mtc.CONSUMER_SECRET)
auth.set_access_token(mtc.ACCESS_TOKEN, mtc.ACCESS_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True,
                 retry_count=2)

try:
    api.verify_credentials()
    print("Authentication Successful!")
except:
    print("Error during authentication! :(")

mentions = api.mentions_timeline()
pattern = r'([0-9]+.*[-+*/%].*[0-9]+)+'

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

def check_mentions(api, since_id):
    logger.info("Collecting mentions... ")
    new_since_id = since_id
    for tweet in tweepy.Cursor(api.mentions_timeline, since_id=since_id).items():
        match = re.search(pattern, tweet.text)
        equation = tweet.text[match.start():match.end()]
        new_since_id = max(tweet.id, new_since_id)
        if tweet.in_reply_to_status_id is not None:
            continue
        if match:
            ans = eval(tweet.text[match.start():match.end()])
            api.update_status(
                status=f"The answer to {str(equation)} is {ans}. ",
                in_reply_to_status_id=tweet.id)
        elif not match:
            api.update_status(
                status=f"Hello! \n\nIt worked! \nYay! ^-^ \n\n (You said: \"{tweet.text}\".)",
                in_reply_to_status_id=tweet.id)
    return new_since_id

def main():
    since_id = 1
    while True:
        since_id = check_mentions(api, since_id)
        logger.info("Waiting... ")
        wait(15)

if __name__ == "__main__":
    logger.info("Running script... ")
    wait(1)
    main()

# for m in mentions:
#     api.update_status(f"#{m.user.screen_name} Hello! \nYou said: \n{m.text}", m.id)
#     wait(15)
When I run this, I get an error stating AttributeError: 'NoneType' object has no attribute 'start' on the line equation = tweet.text[match.start():match.end()]. I have researched this and how to index tweet text (with Tweepy). I'm confused as to why I get a NoneType error when there is an if match: check right before the eval() call. Shouldn't that catch it? Why does this happen?
Thanks!
re.search returns None when it doesn't find a match. You should check the return value before using it, like this:
match = re.search(pattern, tweet.text)
if match:
    equation = tweet.text[match.start():match.end()]
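Applied to the loop in the question, that means slicing and evaluating only inside the guard. A sketch of the corrected body, reusing the question's names:
for tweet in tweepy.Cursor(api.mentions_timeline, since_id=since_id).items():
    new_since_id = max(tweet.id, new_since_id)
    if tweet.in_reply_to_status_id is not None:
        continue
    match = re.search(pattern, tweet.text)
    if match:
        # safe: we only reach here when a match exists
        equation = tweet.text[match.start():match.end()]
        ans = eval(equation)
        api.update_status(
            status=f"The answer to {equation} is {ans}. ",
            in_reply_to_status_id=tweet.id)
    else:
        api.update_status(
            status=f"Hello! \n\nIt worked! \nYay! ^-^ \n\n (You said: \"{tweet.text}\".)",
            in_reply_to_status_id=tweet.id)
As an aside, eval() will execute whatever expression the tweet contains, so a production bot would want a restricted arithmetic parser instead.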

How to callback on error messages (python)

I am a fairly new Python coder and I wish to create a Twitter bot in which, every time it retweets, it favourites the tweet as well. I am not exactly sure how to do that, but when the bot searches, it throws an error of 'list index out of range'.
import tweepy, time, traceback
from tweepy.auth import OAuthHandler
from tweepy.streaming import StreamListener, Stream

ckey = ''
csecret = ''
atoken = ''
asecret = ''

auths = OAuthHandler(ckey, csecret)
auths.set_access_token(atoken, asecret)
api = tweepy.API(auths)

class listener(StreamListener):

    def on_data(self, raw_data):
        try:
            tweet_text = raw_data.lower().split('"text":')[1].split('","source":"')[0].replace(",", "")
            screen_name = raw_data.lower().split('"screen_name":"')[1].split('","location"')[0].replace(",", "")
            tweet_cid = raw_data.split('"id:')[1].split('"id_str":')[0].replace(",", "")
            # there is meant to be 4 spaces at tweet_text

            accs = ['']  # banned account screen names go in here
            words = ['hate', 'derp', 'racist', 'evil', 'keemstar', 'mario', 'kirby']  # banned words go in here
            if not any(acc in screen_name.lower() for acc in accs):
                if not any(word in tweet_text.lower() for word in words):
                    fav(tweet_cid)
                    follow(screen_name)
                    retweet(tweet_cid)
                    tweet(myinput)
                    # call what u want to do here
                    # fav(tweet_cid)
                    # retweet(tweet_cid)
            return True
        except Exception as e:
            print(str(e))  # prints the error message; if you don't want it, comment it out
            pass

    def on_error(self, status_code):
        try:
            print("error" + status_code)
        except Exception as e:
            print(str(e))
            pass

def retweet(tweet_cid):
    try:
        api.retweet(tweet_cid)
        time.sleep(random.randit(range(50, 900)))
    except Exception as e:
        print(str(e))
        pass

def follow(screen_name):
    try:
        api.create_friendship(screen_name)
        time.sleep(random.randit(range(50, 900)))
    except Exception as e:
        print(str(e))
        pass

def fav(tweet_cid):
    try:
        api.create_favourite(tweet_cid)
        time.sleep(random.randit(range(600, 1100)))
    except Exception as e:
        print(str(e))
        pass

def unfav(tweet_cid):
    try:
        api.destroy_tweet(tweet_cid)
        time.sleep(random.randit(range(8000, 9000)))
    except Exception as e:
        print(str(e))
        pass

def tweet(myinput):
    try:
        api.update_status(myinput)
        time.sleep(random.randit(range(1000, 4000)))
    except Exception as e:
        print(str(e))
        pass

# tags below
track_words = [""]  # deleted all tags so easier to read
follow_acc = []  # all usernames converted to user ids

try:
    twt = Stream(auths, listener())
    twt.filter(track=track_words, follow=follow_acc)
except Exception as e:
    print(str(e))
    pass
Is this what you are asking for? It gives the stack trace of the exception.
import traceback
try:
    s = 'hi'
    s = s + 1
except Exception as e:
    print(traceback.format_exc())
Output:
Traceback (most recent call last):
File "<stdin>", line 3, in <module>
TypeError: cannot concatenate 'str' and 'int' objects
Hope this helps! :)
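Applied to the bot above, the most useful place for this is the except block in on_data: the full trace shows exactly which raw_data.split(...)[1] lookup raises the 'list index out of range', not just the message. A sketch of that one change:
def on_data(self, raw_data):
    try:
        tweet_text = raw_data.lower().split('"text":')[1].split('","source":"')[0].replace(",", "")
        # ... rest of the handler unchanged ...
        return True
    except Exception:
        print(traceback.format_exc())  # full stack trace instead of just str(e)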

How to properly debug ThreadPool?

I'm trying to get some data from a web page. To speed up this process (they allow me to make 1000 requests per minute), I use a ThreadPool.
Since there is a huge amount of data, the process is quite vulnerable to connection failures and the like, so I try to log everything I can in order to detect each mistake I made in the code.
The problem is that the program sometimes just stops without any exception (it acts like it is running, but with no effect; I use PyCharm). I log caught exceptions everywhere I can, but I can't see any exception in any log.
I assumed that if a timeout were reached, an exception would be raised and logged.
I've found out where the problem could be. Here is the code:
As a pool, I use: from multiprocessing.pool import ThreadPool as Pool
And lock: from threading import Lock
The download_category function is being used in a loop.
def download_category(url):
    # some code
    #
    # ...
    try:  # (restored here: the except below needs a matching try, presumably part of the elided code)
        log('Create pool...')
        _pool = Pool(_workers_number)
        with open('database/temp_produkty.txt') as f:
            log('Spracovavanie produktov... vytvaranie vlakien...')  # "Processing products... creating threads..." -- I see this in the log
            for url_product in f:
                x = _pool.apply_async(process_product, args=(url_product.strip('\n'), url))
        _pool.close()
        _pool.join()
        log('Presuvanie produktov z temp export do export.csv...')  # "Moving products from temp export to export.csv..." -- I can't see this in the log
        temp_export_to_export_csv()
        set_spracovanie_kategorie(url)
    except Exception as e:
        logging.exception('Got exception on download_one_category: {}'.format(url))
And the process_product function:
def process_product(url, cat):
    try:
        data = get_product_data(url)
    except:
        log('{}: {} exception while getting product data... #')  # I don't see this in the log
        return
    try:
        print_to_temp_export(data, cat)  # I don't see this in the log
    except:
        log('{}: {} exception while printing to csv... #')  # I don't see this in the log
        raise
LOG function:
def log(text):
    now = datetime.now().strftime('%d.%m.%Y %H:%M:%S')
    _lock.acquire()
    mLib.printToFile('logging/log.log', '{} -> {}'.format(now, text))
    _lock.release()
I use the logging module too. In that log I can see that the request was apparently sent 8 times (the number of workers), but no answer was ever received.
EDIT1:
def get_product_data(url):
    data = defaultdict(lambda: '-')
    root = load_root(url)
    try:
        nazov = root.xpath('//h1[@itemprop="name"]/text()')[0]
    except:
        nazov = root.xpath('//h1/text()')[0]
    under_block = root.xpath('//h2[@id="lowest-cost"]')
    if len(under_block) < 1:
        under_block = root.xpath('//h2[contains(text(),"Naj")]')
    if len(under_block) < 1:
        return False
    data['nazov'] = nazov
    data['url'] = url
    blocks = under_block[0].xpath('./following-sibling::div[@class="shp"]/div[contains(@class,"shp")]')
    i = 0
    for block in blocks:
        i += 1
        data['dat{}_men'.format(i)] = block.xpath('.//a[@class="link"]/text()')[0]
    del root
    return data
LOAD ROOT:
class RedirectException(Exception):
    pass

def load_url(url):
    r = requests.get(url, allow_redirects=False)
    if r.status_code == 301:
        raise RedirectException
    if r.status_code == 404:
        if '-q-' in url:
            url = url.replace('-q-', '-')
            mLib.printToFileWOEncoding('logging/neexistujuce.txt', 'Skusanie {} kategorie...'.format(url))  # "Trying {} category..."
            return load_url(url)  # THIS IS NOT LOOPING
        else:
            mLib.printToFileWOEncoding('logging/neexistujuce.txt', '{}'.format(url))
    html = r.text
    return html

def load_root(url):
    try:
        html = load_url(url)
    except Exception as e:
        logging.exception('load_root_exception')
        raise
    return etree.fromstring(html, etree.HTMLParser())
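Worth noting for anyone debugging a silent stall like this: requests.get is called here without a timeout, and by default requests waits indefinitely, so a hung connection never raises the timeout exception the logs are waiting for. A sketch of two changes that would make hangs and worker errors visible (the 30-second value is arbitrary):
# in load_url: bound every request
r = requests.get(url, allow_redirects=False, timeout=30)  # raises requests.exceptions.Timeout on a stall

# in download_category: keep the AsyncResult objects and call .get(),
# which re-raises any exception from the worker thread in the caller
results = [_pool.apply_async(process_product, args=(u.strip('\n'), url))
           for u in f]
_pool.close()
_pool.join()
for res in results:
    try:
        res.get(timeout=1)  # results are ready after join(); worker exceptions surface here
    except Exception:
        logging.exception('worker failed')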

Store and compare last two lines using response.iter_lines()

I have a rate stream where I need to store and compare the last two lines. For instance, if the new price is higher than the previous one, queue an event. It's my understanding that iter_lines() only yields the most recent line. My question is: how could I store the last line, wait for a new line, compare the two, then queue the event? I know this is simple, but I'm still having trouble. Thanks for your help!
Here is my UPDATED(3) stream:
def stream_to_queue(self):
    response = self.connect_to_stream()
    if response.status_code != 200:
        return
    oldLine = ''
    for line in response.iter_lines(1):
        if line < oldLine:
            try:
                msg = json.loads(line)
            except Exception as e:
                print "Caught exception when converting message into json\n" + str(e)
                return
            if msg.has_key("instrument") or msg.has_key("tick"):
                print msg["tick"]
                instrument = msg["tick"]["instrument"]
                time = msg["tick"]["time"]
                bid = msg["tick"]["bid"]
                ask = msg["tick"]["ask"]
                stop = msg["tick"]["ask"]
                tev = TickEvent(instrument, time, bid, ask)
                self.events_queue.put(tev)
        oldLine = line
The original function:
def stream_to_queue(self):
    response = self.connect_to_stream()
    if response.status_code != 200:
        return
    for line in response.iter_lines(1):
        if line:
            try:
                msg = json.loads(line)
            except Exception as e:
                print "Caught exception when converting message into json\n" + str(e)
                return
            if msg.has_key("instrument") or msg.has_key("tick"):
                print msg["tick"]
                instrument = msg["tick"]["instrument"]
                time = msg["tick"]["time"]
                bid = msg["tick"]["bid"]
                ask = msg["tick"]["ask"]
                stop = msg["tick"]["ask"]
The repaired function:
def stream_to_queue(self):
    response = self.connect_to_stream()
    if response.status_code != 200:
        return
    last_msg = None  # new line
    for line in response.iter_lines(1):
        if line:
            try:
                msg = json.loads(line)
                if last_msg is None:  # new line
                    last_msg = msg  # new line
            except Exception as e:
                print "Caught exception when converting message into json\n" + str(e)
                return
            # can now compare last msg with current msg
            if msg.has_key("instrument") or msg.has_key("tick"):
                print msg["tick"]
                instrument = msg["tick"]["instrument"]
                time = msg["tick"]["time"]
                bid = msg["tick"]["bid"]
                ask = msg["tick"]["ask"]
                stop = msg["tick"]["ask"]
            last_msg = msg  # new line (may want to indent 4 more spaces)
It may make sense to move the if last_msg is None check inside the if msg.has_key("tick") block if you want last_msg to be guaranteed to hold that information.
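For the comparison the question actually asks about (queue an event when the new price is higher than the previous one), here is a sketch that would sit inside the has_key("tick") block of the repaired function, using the bid price as an example field:
if last_msg is not None and last_msg.has_key("tick"):
    if msg["tick"]["bid"] > last_msg["tick"]["bid"]:
        # new price is higher than the previous tick: queue the event
        tev = TickEvent(instrument, time, bid, ask)
        self.events_queue.put(tev)
last_msg = msg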

python 3 - urllib issue

I'm using Python 3.3.0 on Windows 7.
I have two files: dork.txt and fuzz.py
dork.txt contains the following:
/about.php?id=1
/en/company/news/full.php?Id=232
/music.php?title=11
fuzz.py contains the following:
srcurl = "ANY-WEBSITE"
drkfuz = open("dorks.txt", "r").readlines()
print("\n[+] Number of dork names to be fuzzed:",len(drkfuz))
for dorks in drkfuz:
dorks = dorks.rstrip("\n")
srcurl = "http://"+srcurl+dorks
requrl = urllib.request.Request(srcurl)
#httpreq = urllib.request.urlopen(requrl)
# Starting the request
try:
httpreq = urllib.request.urlopen(requrl)
except urllib.error.HTTPError as e:
print ("[!] Error code: ", e.code)
print("")
#sys.exit(1)
except urllib.error.URLError as e:
print ("[!] Reason: ", e.reason)
print("")
#sys.exit(1)
#if e.code != 404:
if httpreq.getcode() == 200:
print("\n*****srcurl********\n",srcurl)
return srcurl
So, when I enter a website name which actually has /about.php?id=1, it works fine. But when I provide the website which has /en/company/news/full.php?Id=232, it first prints Error code: 404 and then gives me one of the following errors: UnboundLocalError: local variable 'e' referenced before assignment or UnboundLocalError: local variable 'httpreq' referenced before assignment.
I can understand that if the website doesn't have a page matching /about.php?id=1 it gives Error code: 404, but why is it not going back into the for loop to check the remaining dorks in the text file? Why does it stop there and throw an error?
I want to make a script that finds valid pages given just a website address like www.xyz.com.
When the urllib.request.urlopen(requrl) call throws an exception, the variable httpreq is never set. You could set it to None before the try statement, then test whether it is still None afterwards:
httpreq = None
try:
    httpreq = urllib.request.urlopen(requrl)
# ...

if httpreq is not None and httpreq.getcode() == 200:
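Wired into the loop, that could look like the sketch below. Using continue in the except handlers also explains the confusion: the for loop does keep going after an exception, but the code after the handlers was running with httpreq unset. (The URL is also rebuilt from scratch each pass here, since the original kept appending to srcurl.)
for dorks in drkfuz:
    url = "http://" + srcurl + dorks.rstrip("\n")
    try:
        httpreq = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        print("[!] Error code:", e.code)
        continue  # skip to the next dork
    except urllib.error.URLError as e:
        print("[!] Reason:", e.reason)
        continue
    if httpreq.getcode() == 200:
        print("\n*****srcurl********\n", url)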
srcurl = "ANY-WEBSITE"
drkfuz = open("dorks.txt", "r").readlines()
print("\n[+] Number of dork names to be fuzzed:",len(drkfuz))
for dorks in drkfuz:
dorks = dorks.rstrip("\n")
srcurl = "http://"+srcurl+dorks
try:
requrl = urllib.request.Request(srcurl)
if requrl != None and len(requrl) > 0:
try:
httpreq = urllib.request.urlopen(requrl)
if httpreq.getcode() == 200:
print("\n*****srcurl********\n",srcurl)
return srcurl
except:
# Handle exception
pass
except:
# Handle your exception
print "Exception"
Untested code, but it will work logically.
