I am a Python newbie and am trying to print error messages when using Tweepy to stream tweets. I used an endless loop in my streaming code because it generates InComplete Read errors otherwise. My aim is to print all the error messages I get while continuing to stream tweets, so that I am aware of errors other than the InComplete Read errors.
My streamListerner is:
# Code from http://badhessian.org/2012/10/collecting-real-time-twitter-data- with-the-streaming-api/ with minor modifications
import json, time, sys
from tweepy import StreamListener
# create an instance of a tweepy StreamListener to handle the incoming data.
class SListener(StreamListener):
def __init__(self, fprefix = 'streamer'):
# self.api = api or API()
self.counter = 0
self.fprefix = fprefix
self.output = open('../Dissertation/stream_3_data/' + fprefix + '.' + time.strftime('%Y%m%d-%H%M%S') + '.json', 'w')
self.delout = open('delete.txt', 'a')
def on_data(self, data):
if 'in_reply_to_status' in data:
self.on_status(data)
elif 'delete' in data:
delete = json.loads(data)['delete']['status']
if self.on_delete(delete['id'], delete['user_id']) is False:
return False
elif 'limit' in data:
if self.on_limit(json.loads(data)['limit']['track']) is False:
return False
elif 'warning' in data:
warning = json.loads(data)['warnings']
print warning['message']
return False
def on_status(self, status):
self.output.write(status)
self.counter += 1
if self.counter >= 5000: # New file is started every 5,000 tweets, tagged with prefix and a timestamp.
self.output.close()
self.output = open('../Dissertation/stream_3_data/' + self.fprefix + '.'
+ time.strftime('%Y%m%d-%H%M%S') + '.json', 'w')
self.counter = 0
return
def on_delete(self, status_id, user_id):
self.delout.write( str(status_id) + "\n")
return
def on_limit(self, track):
sys.stderr.write(track + "\n")
return
def on_error(self, status_code):
sys.stderr.write('Error: ' + str(status_code) + "\n")
return True # Don't kill the stream
def on_timeout(self):
sys.stderr.write("Timeout, sleeping for 60 seconds...\n")
time.sleep(60)
return True # Don't kill the stream
The part that seems to generate problems is when I try to use the streamlistener:
twitter_api = tweepy_oauth()
Q = "twitter.com"
locations = [101.615161,3.08115,101.753663,3.167507,
115.421372,39.43277,117.501099,41.05999,
120.858322,30.69094,121.9733,31.86889]
# Create a streaming API and set a timeout value of 60 seconds.
streaming_api = tweepy.streaming.Stream(twitter_api, SListener(), timeout=60)
# Used infinite loop from https://github.com/ryanmcgrath/twython/issues/288 cause
# I kept getting InComplete Read Error. Probably due to high volumes of tweets being sent to me at once
#Endless loop
while True:
try:
streaming_api.filter(follow=None, track=None, locations=locations, stall_warnings=True)
except:
e = sys.exc_info()[0] #Get exception info
print 'ERROR:',e #Print exception info
continue
My code does run and works, but I encounter the following error occasionally, which stops my entire stream:
---------------------------------------------------------------------------
IOError Traceback (most recent call last)
<ipython-input-4-fb45fa5d8307> in <module>()
34 streaming_api.filter(follow=None, track=None, locations=locations, stall_warnings=True)
35 except:
36 e = sys.exc_info()[0] #Get exception info
---> 37 print 'ERROR:',e #Print exception info
38 continue
IOError: [Errno 22] Invalid argument
The timing when the error appears is inconsistent - it ranges from 1h into the stream to an entire day into the stream.
I concluded that the issue is with the print statement because I replaced line 37 with
print 'Error'
and the same error message appears. I am not sure how to proceed when even the basic print statement does not work - any help would be great.
Related
i have this problem with this code:
class TTTEngine:
def __init__(self):
self.logger = logging
def recognize_input(self):
self.logger.info("Waiting for user input.")
text_transcript = input('>> ').lower()
while text_transcript == '':
self.logger.info("User didn't said something")
text_transcript = input('>> ').lower()
return text_transcript
I get this error with module 'logging' when i type a word in console input:
'<' not supported between instances of 'builtin_function_or_method' and 'int'
I think than the try/except that is logging errors is this:
try:
while not self.message_queue.empty():
cumulative_batch = ''
message = self.message_queue.get()
batches = self._create_text_batches(raw_text=message)
for batch in batches:
self.tts_engine.say(batch)
cumulative_batch += batch
self.console_manager.console_output(cumulative_batch)
self.run_engine()
if self.stop_speaking:
self.logger.debug('Speech interruption triggered')
self.stop_speaking = False
break
except Exception as e:
self.logger.error("Speech and console error message: {0}".format(e))
This is tts.py complete file:
import threading
import logging
import pyttsx3
import queue
class TTS:
"""
Text To Speech Engine (TTS)
"""
def __init__(self):
self.tts_engine = self._set_voice_engine()
def run_engine(self):
try:
self.tts_engine.runAndWait()
except RuntimeError:
pass
#staticmethod
def _set_voice_engine():
"""
Setup text to speech engine
:return: gtts engine object
"""
tts_engine = pyttsx3.init()
tts_engine.setProperty('rate', 160) # Setting up new voice rate
tts_engine.setProperty('volume', 1.0) # Setting up volume level between 0 and 1
return tts_engine
class TTSEngine(TTS):
def __init__(self, console_manager, speech_response_enabled):
super().__init__()
self.logger = logging
self.message_queue = queue.Queue(maxsize=5) # Maxsize is the size of the queue / capacity of messages
self.console_manager = console_manager
self.speech_response_enabled = speech_response_enabled
self.stop_speaking = False
def assistant_response(self, message):
"""
Assistant response in voice or/and in text.
:param message: string
"""
if self.speech_response_enabled:
self._insert_into_message_queue(message)
try:
speech_tread = threading.Thread(target=self._speech_and_console)
speech_tread.start()
except RuntimeError as e:
self.logger.error('Error in assistant response thread with message {0}'.format(e))
def _insert_into_message_queue(self, message):
try:
if message:
self.message_queue.put(message)
except Exception as e:
self.logger.error("Unable to insert message to queue with error message: {0}".format(e))
def _speech_and_console(self):
"""
Speech method translate text batches to speech and print them in the console.
:param text: string (e.g 'tell me about google')
"""
try:
while not self.message_queue.empty():
cumulative_batch = ''
message = self.message_queue.get()
batches = self._create_text_batches(raw_text=message)
for batch in batches:
self.tts_engine.say(batch)
cumulative_batch += batch
self.console_manager.console_output(cumulative_batch)
self.run_engine()
if self.stop_speaking:
self.logger.debug('Speech interruption triggered')
self.stop_speaking = False
break
except Exception as e:
self.logger.error("Speech and console error message: {0}".format(e))
#staticmethod
def _create_text_batches(raw_text, number_of_words_per_batch=8):
"""
Splits the user speech message into batches and return a list with the split batches
:param raw_text: string
:param number_of_words_per_batch: int
:return: list
"""
raw_text = raw_text + ' '
list_of_batches = []
total_words = raw_text.count(' ')
letter_id = 0
for split in range(0, int(total_words / number_of_words_per_batch)):
batch = ''
words_count = 0
while words_count < number_of_words_per_batch:
batch += raw_text[letter_id]
if raw_text[letter_id] == ' ':
words_count += 1
letter_id += 1
list_of_batches.append(batch)
if letter_id < len(raw_text): # Add the rest of word in a batch
list_of_batches.append(raw_text[letter_id:])
return list_of_batches
i need a script to make it like a cpanel checker, with more than 1 url and the url is stored in a txt file.
usage : python script.py list.txt
format in file list.txt : https://demo.cpanel.net:2083|democom|DemoCoA5620
this is my code but it doesn't work, can someone help me?
Thanks.
import requests, sys
from multiprocessing.dummy import Pool as ThreadPool
try:
with open(sys.argv[1], 'r') as f:
list_data = [line.strip() for line in f if line.strip()]
except IOError:
pass
def cpanel(url):
try:
data = {'user':'democom', 'pass':'DemoCoA5620'}
r = requests.post(url, data=data)
if r.status_code==200:
print "login success"
else:
print "login failed"
except:
pass
def chekers(url):
try:
cpanel(url)
except:
pass
def Main():
try:
start = timer()
pp = ThreadPool(25)
pr = pp.map(chekers, list_data)
print('Time: ' + str(timer() - start) + ' seconds')
except:
pass
if __name__ == '__main__':
Main()
I fixed your code in a way that it will return an actual array containing a boolean array indicating the success of the cpanel function.
from __future__ import print_function
import requests
from multiprocessing.pool import ThreadPool
try:
list_data = ["https://demo.cpanel.net:2083|democom|DemoCoA5620",
"https://demo.cpanel.net:2083|UserDoesNotExist|WRONGPASSWORD",
]
except IOError:
pass
def cpanel(url):
try:
# try to split that url to get username / password
try:
url, username, password = url.split('|')
except Exception as e:
print("Url {} seems to have wrong format. Concrete error: {}".format(url, e))
return False
# build the correct url
url += '/login/?login_only=1'
# build post parameters
params = {'user': username,
'pass': password}
# make request
r = requests.post(url, params)
if r.status_code==200:
print("login for user {} success".format(username))
return True
else:
print("login for user {} failed due to Status Code {} and message \"{}\"".format(username, r.status_code, r.reason))
return False
except Exception as e:
print("Error occured for url {} ".format(e))
return False
def chekers(url):
return cpanel(url)
def Main():
try:
# start = timer()
pp = ThreadPool(1)
pr = pp.map(chekers, list_data)
print(pr)
# print('Time: ' + str(timer() - start) + ' seconds')
except:
pass
if __name__ == '__main__':
Main()
Output:
login for user democom success
login for user UserDoesNotExist failed due to Status Code 401 and message "Access Denied"
[True, False]
Be aware that I replaced your file read operation by some fixed urls.
Since you use request.post I guess you actually want to POST something to that urls. Your code does not do that. If you just want to send a request, use the requests.get method.
See the official documentation for the requests packet: https://2.python-requests.org/en/master/user/quickstart/#make-a-request for more details.
Also note that
"but it doesn't work"
is NOT a question.
I'm trying to get some data from a web page. To speed up this process (they allow me to make 1000 requests per minute), I use ThreadPool.
Since there is a huge amount of data, the process is quite vulnerable to connection fails etc. so I try to log everything I can to be able to detect each mistake I did in code.
The problem is that program sometimes just stops without any exception (it acts like it is running but with no effect - I use PyCharm). I log catched exceptions everywhere I can but I can't see any exception in any log.
I assume that if there were a timeout reached, the exception would be raised and logged.
I've found out where the problem could be. Here is the code:
As a pool, I use: from multiprocessing.pool import ThreadPool as Pool
And lock: from threading import Lock
The download_category function is being used in loop.
def download_category(url):
# some code
#
# ...
log('Create pool...')
_pool = Pool(_workers_number)
with open('database/temp_produkty.txt') as f:
log('Spracovavanie produktov... vytvaranie vlakien...') # I see this in log
for url_product in f:
x = _pool.apply_async(process_product, args=(url_product.strip('\n'), url))
_pool.close()
_pool.join()
log('Presuvanie produktov z temp export do export.csv...') # I can't see this in log
temp_export_to_export_csv()
set_spracovanie_kategorie(url)
except Exception as e:
logging.exception('Got exception on download_one_category: {}'.format(url))
And process_product function:
def process_product(url, cat):
try:
data = get_product_data(url)
except:
log('{}: {} exception while getting product data... #') # I don't see this in log
return
try:
print_to_temp_export(data, cat) # I don't see this in log
except:
log('{}: {} exception while printing to csv... #') # I don't see this in log
raise
LOG function:
def log(text):
now = datetime.now().strftime('%d.%m.%Y %H:%M:%S')
_lock.acquire()
mLib.printToFile('logging/log.log', '{} -> {}'.format(now, text))
_lock.release()
I use logging module too. In this log, I see that probably 8 (number of workers) times request was sent but no answer hasn't been recieved.
EDIT1:
def get_product_data(url):
data = defaultdict(lambda: '-')
root = load_root(url)
try:
nazov = root.xpath('//h1[#itemprop="name"]/text()')[0]
except:
nazov = root.xpath('//h1/text()')[0]
under_block = root.xpath('//h2[#id="lowest-cost"]')
if len(under_block) < 1:
under_block = root.xpath('//h2[contains(text(),"Naj")]')
if len(under_block) < 1:
return False
data['nazov'] = nazov
data['url'] = url
blocks = under_block[0].xpath('./following-sibling::div[#class="shp"]/div[contains(#class,"shp")]')
i = 0
for block in blocks:
i += 1
data['dat{}_men'.format(i)] = eblock.xpath('.//a[#class="link"]/text()')[0]
del root
return data
LOAD ROOT:
class RedirectException(Exception):
pass
def load_url(url):
r = requests.get(url, allow_redirects=False)
if r.status_code == 301:
raise RedirectException
if r.status_code == 404:
if '-q-' in url:
url = url.replace('-q-','-')
mLib.printToFileWOEncoding('logging/neexistujuce.txt','Skusanie {} kategorie...'.format(url))
return load_url(url) # THIS IS NOT LOOPING
else:
mLib.printToFileWOEncoding('logging/neexistujuce.txt','{}'.format(url))
html = r.text
return html
def load_root(url):
try:
html = load_url(url)
except Exception as e:
logging.exception('load_root_exception')
raise
return etree.fromstring(html, etree.HTMLParser())
I'm trying to get the follower count of companies and track it over time. I have over 200 000 companies so the code I currently have would literally take years to run with current api limit.
c = tweepy.Cursor(api.followers_ids, id = a)
ids = []
for id in c.items():
time.sleep(0.01)
ids.append(id) '
In this code its one api hit for every follower. I was wondering if there is a function that just gives a follower count as a number? Also what is the twitter api limit?
Each API requests returns at most 5000 followers IDs at a time, to retrieve all the followers of the 200 000 companies, here is a very useful script from the book Mining the social web by Matthew A. Russell to solve the twitter api limit
to make robust twitter request and to access twitter's API Matthew defined these methods :
import sys
import time
from urllib2 import URLError
from httplib import BadStatusLine
import json
import twitter
def oauth_login():
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)
return twitter_api
def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw):
# A nested helper function that handles common HTTPErrors. Return an updated
# value for wait_period if the problem is a 500 level error. Block until the
# rate limit is reset if it's a rate limiting issue (429 error). Returns None
# for 401 and 404 errors, which requires special handling by the caller.
def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):
if wait_period > 3600: # Seconds
print >> sys.stderr, 'Too many retries. Quitting.'
raise e
# See https://dev.twitter.com/docs/error-codes-responses for common codes
if e.e.code == 401:
print >> sys.stderr, 'Encountered 401 Error (Not Authorized)'
return None
elif e.e.code == 404:
print >> sys.stderr, 'Encountered 404 Error (Not Found)'
return None
elif e.e.code == 429:
print >> sys.stderr, 'Encountered 429 Error (Rate Limit Exceeded)'
if sleep_when_rate_limited:
print >> sys.stderr, "Retrying in 15 minutes...ZzZ..."
sys.stderr.flush()
time.sleep(60*15 + 5)
print >> sys.stderr, '...ZzZ...Awake now and trying again.'
return 2
else:
raise e # Caller must handle the rate limiting issue
elif e.e.code in (500, 502, 503, 504):
print >> sys.stderr, 'Encountered %iError. Retrying in %iseconds' %\
(e.e.code, wait_period)
time.sleep(wait_period)
wait_period *= 1.5
return wait_period
else:
raise e
# End of nested helper function
wait_period = 2
error_count = 0
while True:
try:
return twitter_api_func(*args, **kw)
except twitter.api.TwitterHTTPError, e:
error_count = 0
wait_period = handle_twitter_http_error(e, wait_period)
if wait_period is None:
return
except URLError, e:
error_count += 1
print >> sys.stderr, "URLError encountered. Continuing."
if error_count > max_errors:
print >> sys.stderr, "Too many consecutive errors...bailing out."
raise
except BadStatusLine, e:
error_count += 1
print >> sys.stderr, "BadStatusLine encountered. Continuing."
if error_count > max_errors:
print >> sys.stderr, "Too many consecutive errors...bailing out."
raise
here is the methods to retrieve the friends and the followers :
from functools import partial
from sys import maxint
def get_friends_followers_ids(twitter_api, screen_name=None, user_id=None,
friends_limit=maxint, followers_limit=maxint):
# Must have either screen_name or user_id (logical xor)
assert (screen_name != None) != (user_id != None),\
"Must have screen_name or user_id, but not both"
# See https://dev.twitter.com/docs/api/1.1/get/friends/ids and
# https://dev.twitter.com/docs/api/1.1/get/followers/ids for details
# on API parameters
get_friends_ids = partial(make_twitter_request, twitter_api.friends.ids,
count=5000)
get_followers_ids = partial(make_twitter_request,twitter_api.followers.ids,
count=5000)
friends_ids, followers_ids = [], []
for twitter_api_func, limit, ids, label in [
[get_friends_ids, friends_limit, friends_ids, "friends"],
[get_followers_ids, followers_limit, followers_ids, "followers"]
]:
if limit == 0: continue
cursor = -1
while cursor != 0:
# Use make_twitter_request via the partially bound callable...
if screen_name:
response = twitter_api_func(screen_name=screen_name, cursor=cursor)
else: # user_id
response = twitter_api_func(user_id=user_id, cursor=cursor)
if response is not None:
ids += response['ids']
cursor = response['next_cursor']
print >> sys.stderr, 'Fetched {0} total {1} ids for{2}'.format(len(ids),
label, (user_id or screen_name))
# XXX: You may want to store data during each iteration to provide
# an additional layer of protection from exceptional circumstances
if len(ids) >= limit or response is None:
break
# Do something useful with the IDs, like store them to disk...
return friends_ids[:friends_limit], followers_ids[:followers_limit]
# Sample usage
twitter_api = oauth_login()
friends_ids, followers_ids =get_friends_followers_ids(twitter_api,
screen_name="SocialWebMining",
friends_limit=10,
followers_limit=10)
print friends_ids
print followers_ids
I am using tweepy and python to gather tweets based on certain keywords and then writing those status updates (tweets) to a CSV file. I do not consider myself a programmer and I am really lost on this.
Here is the Error:
> Traceback (most recent call last):
File "./combined-tweepy.py", line 58, in <module>
sapi.filter(track=[topics])
File "/usr/local/lib/python2.7/dist-packages/tweepy/streaming.py", line 286, in filter
encoded_track = [s.encode(encoding) for s in track]
AttributeError: 'tuple' object has no attribute 'encode'
Here is the script:
#!/usr/bin/python
import sys
import re
import tweepy
import codecs
import datetime
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
# Create a list of topics
with open('termList.txt', 'r') as f:
topics = [line.strip() for line in f]
stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')
topicFile = open(stamp + '.csv', 'w+')
sapi = tweepy.streaming.Stream(auth, CustomStreamListener(topicFile))
sapi.filter(track=[topics])
class CustomStreamListener(tweepy.StreamListener):
def __init__(self, output_file, api=None):
super(CustomStreamListener, self).__init__()
self.num_tweets = 0
self.output_file = output_file
def on_status(self, status):
### Writes one tweet per line in the CSV file
cleaned = status.text.replace('\'','').replace('&','').replace('>','').replace(',','').replace("\n",'')
self.num_tweets = self.num_tweets + 1
if self.num_tweets < 500:
self.output_file.write(status.user.location.encode("UTF-8") + ',' + cleaned.encode("UTF-8") + "\n")
print ("capturing tweet from list")
# print status.user.location
return True
else:
return False
sys.exit("terminating")
def on_error(self, status_code):
print >> sys.stderr, 'Encountered error with status code:', status_code
return True # Don't kill the stream
def on_timeout(self):
print >> sys.stderr, 'Timeout...'
return True #Don't kill the stream
f.close()
Here's the definition of a tuple according to Python's documentation. It seems like one of the words in topics is a tuple.
I see other little errors. First, the way you wrote your code, you should call your functions after you have defined them. For example, these two lines
sapi = tweepy.streaming.Stream(auth, CustomStreamListener(topicFile))
sapi.filter(track=[topics])
should come after you have defined all the functions in
class CustomStreamListener(tweepy.StreamListener):
Also, there's no need to put topics in braces
sapi.filter(track=[topics])
since it's already a list according to this line
topics = [line.strip() for line in f]
Can you show us the content of termList.txt?