I have a Python script that extracts Twitter data via the streaming API. I would like to use a separate file for each day, so I want the script to run for 24 hours and then be killed and restarted, since restarting the program changes the name of the output file.
How can I ensure that the script is stopped at 00:00 and restarted right away?
The code can be found below. If you have any other ideas about how I can create a new text file daily, that would be even better.
import tweepy
import datetime

key_words = ["xx"]
date_today = str(datetime.date.today())  # e.g. "2020-06-01"
twitter_data_title = "".join([xx, "_", date_today, ".txt"])

class TwitterStreamer():
    def __init__(self):
        pass

    def stream_tweets(self, twitter_data_title, key_words):
        listener = StreamListener(twitter_data_title)
        auth = tweepy.OAuthHandler(api_key, api_secret_key)
        auth.set_access_token(access_token, access_secret_token)
        stream = tweepy.Stream(auth, listener)
        stream.filter(track=key_words)

class StreamListener(tweepy.StreamListener):
    def __init__(self, twitter_data_title):
        self.fetched_tweets_filename = twitter_data_title

    def on_data(self, data):
        try:
            print(data)
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except BaseException as e:
            print("Error on_data %s" % str(e))
            return True

    def on_exception(self, exception):
        print('exception', exception)
        stream_tweets(twitter_data_title, key_words)

    def on_error(self, status):
        print(status)

def stream_tweets(twitter_data_title, key_words):
    listener = StreamListener(twitter_data_title)
    auth = tweepy.OAuthHandler(api_key, api_secret_key)
    auth.set_access_token(access_token, access_secret_token)
    stream = tweepy.Stream(auth, listener)
    stream.filter(track=key_words)

if __name__ == '__main__':
    twitter_streamer = TwitterStreamer()
    twitter_streamer.stream_tweets(twitter_data_title, key_words)
It looks like the 'blocking' code in your example comes from another library, so you don't have the opportunity to (easily) change the inner loop to check for a condition and exit.
Using a Background Process (Not Ideal)
You could change your entry point to start the code in a background process, and check to see if the file's title should have changed:
from multiprocessing import Process
from time import sleep
...

if __name__ == "__main__":
    twitter_streamer = TwitterStreamer()
    twitter_data_title, process = None, None
    while True:
        new_data_title = "".join([xx, "_", str(datetime.date.today()), ".txt"])
        if new_data_title == twitter_data_title:  # Nothing to do.
            sleep(60)  # Sleep for a minute
            continue   # And check again
        # Set the new title.
        twitter_data_title = new_data_title
        # If the process is already running, terminate and join it.
        if process is not None:
            process.terminate()
            process.join()
        process = Process(target=twitter_streamer.stream_tweets,
                          args=[twitter_data_title, key_words])
        process.start()
Changing StreamListener
A better alternative would probably be to encode the knowledge of the date into StreamListener. Instead of passing a file name (twitter_data_title), pass a file prefix (xx from your example), and build the filename in a property:
...
class StreamListener(tweepy.StreamListener):
    def __init__(self, file_prefix):
        self.prefix = file_prefix

    @property
    def fetched_tweets_filename(self):
        """The file name for the tweets."""
        date = datetime.date.today()
        return f"{self.prefix}_{date}.txt"
...

...
if __name__ == "__main__":
    twitter_streamer = TwitterStreamer()
    twitter_streamer.stream_tweets(xx, key_words)
Since StreamListener.on_data grabs the file name from self.fetched_tweets_filename, this should mean the tweets are written to the new file when the date changes.
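As a quick sanity check of the property (the prefix "tweets" below is just a hypothetical stand-in for xx):

listener = StreamListener("tweets")      # hypothetical prefix
print(listener.fetched_tweets_filename)  # e.g. "tweets_2020-06-01.txt"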
I would add this to your code:
from threading import Timer

def stopTheScript():
    exec(open("anotherscript.py").read())
    exit()

Timer(86400, stopTheScript).start()  # 86400 s = 24 h
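A related pattern (not part of the answer above, and Unix-specific) is to re-exec the current script after 24 hours, so a date-based filename is recomputed on startup:

import os
import sys
from threading import Timer

def restart_script():
    # Replace the current process with a fresh copy of this same script.
    os.execv(sys.executable, [sys.executable] + sys.argv)

Timer(86400, restart_script).start()  # 86400 s = 24 h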
Related
I have a function that does its work in a while True loop (for example, it just prints a string):
class LoopHandler(metaclass=Singleton):
    def start_loop(self, phrase: str):
        while True:
            print(phrase)
            time.sleep(1)
And I have a simple FastAPI server running in parallel:
class APIServer(uvicorn.Server):
    def install_signal_handlers(self):
        pass

    @contextlib.contextmanager
    def run_in_thread(self):
        thread = threading.Thread(target=self.run)
        thread.start()
        try:
            while not self.started:
                time.sleep(1e-3)
            yield
        except KeyboardInterrupt:
            self.should_exit = True
            thread.join()
And it works fine: the text prints and the API works.
But the task is to "restart" the loop when a specific API method is called. Something like this:
#app.get("/get")
async def get():
response = {'response': 'response'}
# restart loop here with LoopHandler().start_loop('another Text')
return response
Thanks for any advice!
My main():
if __name__ == '__main__':
    config = uvicorn.Config("api_view:app", log_level="debug")
    server = APIServer(config=config)
    with server.run_in_thread():
        LoopHandler().start_loop('Text')
ADD:
When I call LoopHandler from the API, it begins a new thread with a new LoopHandler instance in it. So if I add a flag, like this:
class LoopHandler(metaclass=Singleton):
    def __init__(self, done: bool = False):
        self.done = done

    def start_loop(self, phrase: str):
        while not self.done:
            print(phrase)
            time.sleep(1)
My console looks like this:
Text
another Text
Text
another Text
Text
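If the goal is just to change what the loop prints, one possible approach (a sketch, not a verified fix) is to swap state on the shared singleton from the API thread instead of starting a second loop:

import threading
import time

class LoopHandler(metaclass=Singleton):  # assumes the same Singleton metaclass as above
    def __init__(self):
        self._lock = threading.Lock()
        self._phrase = ''

    def start_loop(self, phrase: str):
        self._phrase = phrase
        while True:
            with self._lock:
                print(self._phrase)
            time.sleep(1)

    def set_phrase(self, phrase: str):
        # Called from the API thread; the one running loop picks it up.
        with self._lock:
            self._phrase = phrase

# In the API handler, the "restart" then becomes:
#     LoopHandler().set_phrase('another Text')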
I have a simple set of objects for managing a background process using the Actor model. In this case I'm concerned with only a single actor. However, it is important that the actor maintains a persistent state between receiving messages.
The objects work by appending messages to a queue in the main thread. The main thread can then execute as it pleases. Every once in a while, it checks to see if anything new is on the results queue; when this happens, it knows the actor has completed the task.
I want to know if this can be implemented in a cleaner way using Future objects. My current implementation is as follows:
import multiprocessing
import time
import collections


class Client(object):
    """
    Object used in the main thread to communicate with background actors
    """
    def __init__(client):
        client.manager = None
        client.start()

    def __del__(client):
        if client.manager and client.manager.is_alive():
            client.get(StopIteration)

    def start(client):
        client.task_queue = multiprocessing.JoinableQueue()
        client.result_queue = multiprocessing.Queue()
        client.result_history = collections.deque(maxlen=1000)
        client.manager = Manager(client.task_queue, client.result_queue)
        client.manager.start()

    def post(client, payload):
        client.task_queue.put(payload)

    def get(client, payload):
        # Exhaust any existing results
        list(client.results())
        # Post the command
        client.post(payload)
        # Wait for a response
        result = client.wait_for_result()
        return result

    def wait_for_result(client):
        wait = 0
        while True:
            for result in client.results():
                return result
            time.sleep(wait)
            # Back off gradually, capped at one second
            wait = min(1, wait + .01)

    def results(client):
        """ Look at results put on the result_queue """
        while not client.result_queue.empty():
            item = client.result_queue.get()
            client.result_history.append(item)
            yield item


class Manager(multiprocessing.Process):
    """
    Manager manages a single actor.
    A manager sends messages to an actor and appends a response when it is
    done.
    """
    def __init__(self, task_queue, result_queue):
        super(Manager, self).__init__()
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        """ main loop """
        terminate = False
        # Create Actor in separate process and send messages to it
        actor = Actor()
        while not terminate:
            message = self.task_queue.get()
            print('Sending message={} to actor'.format(message))
            try:
                if message is StopIteration:
                    content = 'shutdown'
                    terminate = True
                else:
                    content = actor.handle(message)
            except Exception as ex:
                print('Error handling message')
                status = 'error'
                content = repr(ex)
            else:
                status = 'success'
                print('Actor finished handling message={}'.format(message))
            # Send back result
            response = {
                'status': status,
                'content': content
            }
            self.task_queue.task_done()
            self.result_queue.put(response)
        print('Manager is shutting down')


class Actor(object):
    """
    An actor is given messages from its manager and performs actions in a
    single thread. Its state is private and threadsafe.
    """
    def __init__(actor):
        actor.state = {}

    def handle(actor, message):
        if not isinstance(message, dict):
            raise ValueError('Commands must be passed in a message dict')
        message = message.copy()
        action = message.pop('action', None)
        if action is None:
            raise ValueError('message must have an action item')
        if action == 'hello world':
            content = 'hello world'
            return content
        elif action == 'debug':
            return actor
        elif action == 'start':
            actor.state['a'] = 3
            return 'started'
        elif action == 'add':
            for i in range(10000000):
                actor.state['a'] += 1
            return 'added', actor.state['a']
        else:
            raise ValueError('Unknown action=%r' % (action,))


def test():
    print('Starting Test')
    client = Client()
    print('About to send messages')
    # Get sends a message and then blocks until the response is returned.
    print(client.get({'action': 'hello world'}))
    print(client.get({'action': 'start'}))
    print(client.get({'action': 'add'}))
    print('Test completed')


if __name__ == '__main__':
    test()
I would like to modify this code to use Future objects. Whenever the client is about to send a message, is it possible to create a Future object, then send that over the multiprocessing queue? Then the manager could execute the actors function and then modify the state of the Future object instead of appending a result to the result_queue.
This seems like it would offer a cleaner way to associate results with messages sent to the actor. It would also remove the need for the get and results methods I have in the first example.
Intuitively, I want it to look something like this:
from concurrent import futures
import multiprocessing


class Client(object):
    """
    Object used in the main thread to communicate with background actors
    """
    def __init__(client):
        client.manager = None
        client.start()

    def __del__(client):
        if client.manager and client.manager.is_alive():
            f = client.post(StopIteration)

    def start(client):
        client.task_queue = multiprocessing.JoinableQueue()
        client.manager = Manager(client.task_queue)
        client.manager.start()

    def post(client, payload):
        f = futures.Future()
        client.task_queue.put((f, payload))
        return f


class Manager(multiprocessing.Process):
    """
    Manager manages a single actor.
    """
    def __init__(self, task_queue):
        super(Manager, self).__init__()
        self.task_queue = task_queue

    def run(self):
        """ main loop """
        terminate = False
        # Create Actor in separate process and send messages to it
        actor = Actor()
        while not terminate:
            f, message = self.task_queue.get()
            f.set_running_or_notify_cancel()
            print('Sending message={} to actor'.format(message))
            try:
                if message is StopIteration:
                    content = 'shutdown'
                    terminate = True
                else:
                    content = actor.handle(message)
            except Exception as ex:
                print('Error handling message')
                status = 'error'
                content = repr(ex)
            else:
                status = 'success'
                print('Actor finished handling message={}'.format(message))
            # Send back result
            response = {
                'status': status,
                'content': content
            }
            self.task_queue.task_done()
            f.set_result(response)
        print('Manager is shutting down')


class Actor(object):
    """
    An actor is given messages from its manager and performs actions in a
    single thread. Its state is private and threadsafe.
    """
    def __init__(actor):
        actor.state = {}

    def handle(actor, message):
        if not isinstance(message, dict):
            raise ValueError('Commands must be passed in a message dict')
        message = message.copy()
        action = message.pop('action', None)
        if action is None:
            raise ValueError('message must have an action item')
        if action == 'hello world':
            content = 'hello world'
            return content
        elif action == 'debug':
            return actor
        elif action == 'start':
            actor.state['a'] = 3
            return 'started'
        elif action == 'add':
            for i in range(10000000):
                actor.state['a'] += 1
            return 'added', actor.state['a']
        else:
            raise ValueError('Unknown action=%r' % (action,))


def test():
    print('Starting Test')
    client = Client()
    print('About to send messages')
    f1 = client.post({'action': 'hello world'})
    print(f1.result())
    f2 = client.post({'action': 'start'})
    print(f2.result())
    f3 = client.post({'action': 'add'})
    print(f3.result())
    print('Test completed')


if __name__ == '__main__':
    test()
However, this obviously doesn't execute correctly. I believe I need some sort of process pool manager to create the futures for me (because I'm calling methods that are documented as intended to be called only by the pool manager). But I'm not quite sure how to go about doing that. I've used futures before to map singleton worker functions, but I've never managed an external process with state before.
Can someone help me out with this? Perhaps there is an even easier way to go about implementing this with Futures?
So, I went ahead and just made a library to do this:
https://github.com/Erotemic/futures_actors
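The core idea, sketched below with hypothetical names rather than the library's actual API: concurrent.futures.Future objects cannot be pickled across a process boundary, so the client keeps each Future on its side, sends only a message id across the queue, and a small reply thread resolves the matching Future when the Manager posts its response.

import itertools
import multiprocessing
import threading
from concurrent import futures

class FutureClient(object):
    def __init__(self):
        self.task_queue = multiprocessing.JoinableQueue()
        self.result_queue = multiprocessing.Queue()
        self._futures = {}  # message id -> Future; lives only in the parent
        self._ids = itertools.count()
        # A Manager process (as in the question) would read task_queue and
        # put (msg_id, response) pairs on result_queue.
        reply_thread = threading.Thread(target=self._handle_replies)
        reply_thread.daemon = True
        reply_thread.start()

    def post(self, payload):
        f = futures.Future()
        msg_id = next(self._ids)
        self._futures[msg_id] = f
        self.task_queue.put((msg_id, payload))  # only picklable data crosses
        return f

    def _handle_replies(self):
        while True:
            msg_id, response = self.result_queue.get()
            self._futures.pop(msg_id).set_result(response)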
I am trying to stream Twitter data for a period of say 5 minutes, using the Stream.filter() method, and I am storing the retrieved tweets in a JSON file. The problem is that I am unable to stop the filter() method from within the program; I have to stop the execution manually. I tried stopping the stream based on system time using the time package. I was able to stop writing tweets to the JSON file, but the stream method kept going and execution never continued to the next line of code.
I am using IPython notebook to write and execute the code.
Here's the code:
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)

from tweepy import Stream
from tweepy.streaming import StreamListener

class MyListener(StreamListener):

    def __init__(self, start_time, time_limit=60):
        self.time = start_time
        self.limit = time_limit

    def on_data(self, data):
        while (time.time() - self.time) < self.limit:
            try:
                saveFile = open('abcd.json', 'a')
                saveFile.write(data)
                saveFile.write('\n')
                saveFile.close()
                return True
            except BaseException as e:
                print 'failed ondata,', str(e)
                time.sleep(5)
        return True

    def on_status(self, status):
        if (time.time() - self.time) >= self.limit:
            print 'time is over'
            return false

    def on_error(self, status):
        if (time.time() - self.time) >= self.limit:
            print 'time is over'
            return false
        else:
            print(status)
            return True

start_time = time.time()
stream_data = Stream(auth, MyListener(start_time, 20))
stream_data.filter(track=['name1', 'name2', ..., 'name n'])  # list of the strings I want to track
These links are similar but do not answer my question directly:
Tweepy: Stream data for X minutes?
Stopping Tweepy steam after a duration parameter (# lines, seconds, #Tweets, etc)
Tweepy Streaming - Stop collecting tweets at x amount
I used this link as my reference:
http://stats.seandolinar.com/collecting-twitter-data-using-a-python-stream-listener/
In order to close the stream you need to return False from on_data(), or on_status().
Because tweepy.Stream() runs a while loop itself, you don't need the while loop in on_data().
When initializing MyListener, you didn't call the parent class's __init__ method, so it wasn't initialized properly.
So for what you're trying to do, the code should be something like:
class MyStreamListener(tweepy.StreamListener):

    def __init__(self, time_limit=60):
        self.start_time = time.time()
        self.limit = time_limit
        self.saveFile = open('abcd.json', 'a')
        super(MyStreamListener, self).__init__()

    def on_data(self, data):
        if (time.time() - self.start_time) < self.limit:
            self.saveFile.write(data)
            self.saveFile.write('\n')
            return True
        else:
            self.saveFile.close()
            return False

myStream = tweepy.Stream(auth=api.auth, listener=MyStreamListener(time_limit=20))
myStream.filter(track=['test'])
Access the attribute myListener.running, but instead of passing MyListener() directly to Stream, create a variable for the listener first:

myListener = MyListener()
# ... pass myListener to Stream, then your timeout code here, such as time.sleep(20) ...
myListener.running = False
So, I was having this issue as well. Fortunately Tweepy is open source, so it's easy to dig into the problem.
Basically the important part is this, in the Stream class in streaming.py:

def _data(self, data):
    if self.listener.on_data(data) is False:
        self.running = False

That means, to close the connection, you just have to return False from the listener's on_data() method.
For those who are trying with the Twitter API v2 (the StreamingClient class), here is the solution:
client.disconnect()
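A minimal sketch of wiring that up with a timer (assuming tweepy 4.x, a valid bearer_token defined elsewhere, and an example rule):

import threading
import tweepy

client = tweepy.StreamingClient(bearer_token)    # bearer_token assumed defined
client.add_rules(tweepy.StreamRule("python"))    # v2 filtering is rule-based
client.filter(threaded=True)                     # don't block the main thread
threading.Timer(300, client.disconnect).start()  # disconnect after 300 s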
I'm trying to do the following: start a stream listener in a separate thread that fills a queue, so the queue can be processed later on. However, Storm doesn't do anything after the thread starts; it just gets stuck there.
My code looks like this:
import os, sys, traceback, random, StringIO, time
import random
from uuid import uuid4
from select import select
from subprocess import Popen, PIPE
import pyinotify
import simplejson, pycurl
import sys, signal
import twitter
import tweepy
import Queue
import threading

try:
    import simplejson as json
except ImportError:
    import json

import storm

queue = Queue.Queue()

class MyModelParser(tweepy.parsers.ModelParser):
    def parse(self, method, payload):
        result = super(MyModelParser, self).parse(method, payload)
        result._payload = json.loads(payload)
        return result

class CustomStreamListener(tweepy.StreamListener):
    ''' Handles data received from the stream. '''

    def __init__(self, api, q):
        self.api = api
        self.queue = q
        self.queue.put('lalala')

    def on_status(self, status):
        self.queue.put('%s' % status.author.screen_name)
        self.queue.task_done()

    def on_error(self, status_code):
        return True  # To continue listening

    def on_timeout(self):
        return True  # To continue listening

class Starter():
    def __init__(self, q):
        self.queue = q
        hashtag = ['justinbieber', 'snooki', 'daddy_yankee', 'MikeTyson', 'iamdiddy', 'lala']
        auth = self.t_auth()
        api = tweepy.API(auth, parser=MyModelParser())
        stream = tweepy.streaming.Stream(auth, CustomStreamListener(api, queue))
        stream.filter(follow=None, track=hashtag)

    def t_auth(self):
        consumer_key = ""
        consumer_secret = ""
        access_key = ""
        access_secret = ""
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_key, access_secret)
        return auth

class TwitterSpout(storm.Spout):
    SPOUT_NAME = "TwitterSpout"
    queue = queue

    def initialize(self, conf, context):
        self.pid = os.getpid()
        try:
            t = threading.Thread(target=Starter(self.queue))
            t.daemon = True
            t.start()
        except KeyboardInterrupt, e:
            self.log('\n\nStopping')
            raise
Use pyleus (https://github.com/Yelp/pyleus); your spout implementation should have a next_tuple(self) method that emits the output fields, as in the example below:
from pyleus.storm import Spout

class DummySpout(Spout):
    OUTPUT_FIELDS = ['sentence', 'name']

    def initialize(self):
        pass

    def next_tuple(self):
        self.emit(("This is a sentence.", "spout",))

if __name__ == '__main__':
    DummySpout().run()
Then write your bolt:
from pyleus.storm import SimpleBolt

class DummyBolt(SimpleBolt):
    OUTPUT_FIELDS = ['sentence']

    def process_tuple(self, tup):
        sentence, name = tup.values
        new_sentence = "{0} says, \"{1}\"".format(name, sentence)
        self.emit((new_sentence,), anchors=[tup])

if __name__ == '__main__':
    DummyBolt().run()
You can also have a look at how I am using it:
https://github.com/Yelp/pyleus/issues/140
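To connect that back to the question, the spout's next_tuple could drain the module-level queue that the tweepy listener fills. A rough, untested sketch reusing the question's Starter class and queue (Python 2, as in the question):

import threading
import Queue
from pyleus.storm import Spout

class TwitterSpout(Spout):
    OUTPUT_FIELDS = ['screen_name']

    def initialize(self):
        # Pass the class and its args; calling Starter(queue) directly here
        # would block inside __init__ on stream.filter().
        t = threading.Thread(target=Starter, args=(queue,))
        t.daemon = True
        t.start()

    def next_tuple(self):
        try:
            name = queue.get_nowait()  # never block Storm's event loop
        except Queue.Empty:
            return
        self.emit((name,))

if __name__ == '__main__':
    TwitterSpout().run()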
I'm trying to get the clipboard content using a Python script on my Mac (OS X Lion).
I'm searching for an event or something similar, because if I use a plain loop, my application spends all its time watching the clipboard.
Any ideas?
Have you thought about using an endless loop and "sleeping" between tries? I used pyperclip for a simple PoC and it worked like a charm, on Windows and Linux alike.
import time
import pyperclip

recent_value = ""
while True:
    tmp_value = pyperclip.paste()
    if tmp_value != recent_value:
        recent_value = tmp_value
        print("Value changed: %s" % str(recent_value)[:20])
    time.sleep(0.1)
Instead of the print, do whatever you want.
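For example (clipboard_log.txt is just a hypothetical destination):

with open("clipboard_log.txt", "a") as f:  # append each new value to a log
    f.write(recent_value + "\n")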
Here is a complete multithreading example.
import time
import threading
import pyperclip

def is_url_but_not_bitly(url):
    if url.startswith("http://") and "bit.ly" not in url:
        return True
    return False

def print_to_stdout(clipboard_content):
    print("Found url: %s" % str(clipboard_content))

class ClipboardWatcher(threading.Thread):
    def __init__(self, predicate, callback, pause=5.):
        super(ClipboardWatcher, self).__init__()
        self._predicate = predicate
        self._callback = callback
        self._pause = pause
        self._stopping = False

    def run(self):
        recent_value = ""
        while not self._stopping:
            tmp_value = pyperclip.paste()
            if tmp_value != recent_value:
                recent_value = tmp_value
                if self._predicate(recent_value):
                    self._callback(recent_value)
            time.sleep(self._pause)

    def stop(self):
        self._stopping = True

def main():
    watcher = ClipboardWatcher(is_url_but_not_bitly,
                               print_to_stdout,
                               5.)
    watcher.start()
    while True:
        try:
            print("Waiting for changed clipboard...")
            time.sleep(10)
        except KeyboardInterrupt:
            watcher.stop()
            break

if __name__ == "__main__":
    main()
I create a subclass of threading.Thread, override the methods run and __init__, and create an instance of this class. By calling watcher.start() (not run()!), you start the thread.
To safely stop the thread, I wait for <Ctrl>-C (keyboard interrupt) and tell the thread to stop itself.
In the initialization of the class, there is also a pause parameter to control how long to wait between tries.
Use the ClipboardWatcher class as in my example, replacing the callback with whatever you need, e.g., lambda x: bitly(x, username, password).
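For instance (bitly(), username, and password are assumed to be your own helper and credentials):

watcher = ClipboardWatcher(is_url_but_not_bitly,
                           lambda x: bitly(x, username, password),
                           5.)
watcher.start()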
Looking at pyperclip, the meat of it on Mac OS X is:
import os

def macSetClipboard(text):
    outf = os.popen('pbcopy', 'w')
    outf.write(text)
    outf.close()

def macGetClipboard():
    outf = os.popen('pbpaste', 'r')
    content = outf.read()
    outf.close()
    return content
These work for me; how do you get on?
I don't quite follow your comment on being in a loop.
EDIT: Added an 'orrid polling example that shows how changeCount() bumps up on each copy to the pasteboard. It's still not what the OP wants, as there seems to be no event or notification for modifications to the NSPasteboard.
from LaunchServices import *
from AppKit import *
import os

from threading import Timer

def poll_clipboard():
    pasteboard = NSPasteboard.generalPasteboard()
    print pasteboard.changeCount()

def main():
    while True:
        t = Timer(1, poll_clipboard)
        t.start()
        t.join()

if __name__ == "__main__":
    main()
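Building on changeCount(), a watcher can skip reading the pasteboard contents unless the count has actually changed. A sketch, assuming pyobjc's AppKit is installed as in the example above:

import time
from AppKit import NSPasteboard, NSPasteboardTypeString

pb = NSPasteboard.generalPasteboard()
last_count = pb.changeCount()
while True:
    count = pb.changeCount()
    if count != last_count:  # something new was copied
        last_count = count
        print("Pasteboard changed: %s" % pb.stringForType_(NSPasteboardTypeString))
    time.sleep(0.5)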
simple!
import os

def macSetClipboard(text):
    outf = os.popen('pbcopy', 'w')
    outf.write(text)
    outf.close()

def macGetClipboard():
    outf = os.popen('pbpaste', 'r')
    content = outf.read()
    outf.close()
    return content

current_clipboard = macGetClipboard()
while True:
    clipboard = macGetClipboard()
    if clipboard != current_clipboard:
        print(clipboard)
        macSetClipboard("my new string")
        print(macGetClipboard())
        break
I originally posted my answer on a duplicate: Run a python code when copying text with specific keyword
Here is the answer I came up with:
import clipboard
import asyncio

# Example function.
async def your_function():
    print("Running...")

async def wait4update(value):
    while True:
        if clipboard.paste() != value:  # If the clipboard changed.
            return
        # Yield to the event loop between checks; without this await,
        # the loop would block and nothing else could run.
        await asyncio.sleep(0.1)

async def main():
    value = clipboard.paste()  # Set the default value.
    while True:
        update = asyncio.create_task(wait4update(value))
        await update
        value = clipboard.paste()  # Change the value.
        asyncio.create_task(your_function())  # Start your function.

asyncio.run(main())