Twitter scraping: how to cumulatively re-execute on the current result - Python

I'm scraping Twitter with Python.
The crawler extracts tweets in real time for a few popular keywords, and it stops when there are no more tweets matching the keywords.
(That part has worked so far.)
I want the code to keep executing automatically after I start it once, instead of me stopping and restarting it by hand.
(Cumulative re-execution on the current result.)
My first thought was to make the crawl repeat every 12 hours.
But I can't get it to run... I think I need the crawl I wrote to stop after 12 hours.
(Repeat every 12 hours -> pause after 12 hours, then run again.)
I also suspect the result is not cumulative, and that each run just collects the same duplicated tweets from the beginning.
I'm writing this post to get advice on my code or on my approach.
import tweepy
import time
import os
import json
import simplejson

API_key = "x"
API_secret = "x"
Access_token = "x"
Access_token_secret = "x"

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

search_term = 'x'
search_term2 = 'x'
search_term3 = 'x'
search_term4 = 'x'
search_term5 = 'x'
lat = "x"
lon = "x"
radius = "x"
location = "%s,%s,%s" % (lat, lon, radius)

c = tweepy.Cursor(api.search,
                  # note: this format string has only two {} placeholders,
                  # so only search_term and search_term2 actually reach the query
                  q="{}+OR+{}".format(search_term, search_term2, search_term3, search_term4, search_term5),
                  rpp=1000,
                  geocode=location,
                  include_entities=True)

data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    i += 1
    time.sleep(0.35)

wfile = open(os.getcwd() + "/wtt2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
    data['text'] = tweet.text
    wfile.write(data['text'] + '\n')
    i += 1
wfile.close()

from apscheduler.schedulers.blocking import BlockingScheduler

sched = BlockingScheduler()

#sched.scheduled_job('interval', hours=12)
def timed_job():
    print('This job is run every 12 hours.')

sched.configure(options_from_ini_file)
sched.start()
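One common way to get the behaviour described above (a crawl that re-runs on a schedule and only appends new tweets to the existing result) is to remember the highest tweet id seen so far and pass it back as since_id on the next run, with APScheduler handling the 12-hour interval. The following is only a minimal sketch of that idea, not the original poster's code: the file name last_id.txt, the output file name, and the keywords are placeholders.

import os
import time
import tweepy
from apscheduler.schedulers.blocking import BlockingScheduler

auth = tweepy.OAuthHandler("API_key", "API_secret")        # placeholder credentials
auth.set_access_token("Access_token", "Access_token_secret")
api = tweepy.API(auth, wait_on_rate_limit=True)

STATE_FILE = "last_id.txt"   # hypothetical file used to remember progress between runs

def load_last_id():
    # return the highest tweet id from the previous run, or None on the first run
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE) as f:
            content = f.read().strip()
            return int(content) if content else None
    return None

def save_last_id(last_id):
    with open(STATE_FILE, "w") as f:
        f.write(str(last_id))

def crawl():
    since_id = load_last_id()
    newest_id = since_id
    cursor = tweepy.Cursor(api.search, q="keyword1 OR keyword2", since_id=since_id)
    with open("wtt2.txt", "a") as wfile:       # append instead of overwrite -> cumulative
        for tweet in cursor.items():
            wfile.write(tweet.text + "\n")
            if newest_id is None or tweet.id > newest_id:
                newest_id = tweet.id
            time.sleep(0.35)
    if newest_id is not None:
        save_last_id(newest_id)

sched = BlockingScheduler()
sched.add_job(crawl, "interval", hours=12)     # re-run the crawl every 12 hours
sched.start()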

Related

Tweepy returns same tweets when scraping data repeatedly

I am scraping tweet data from Twitter. Since Twitter limits this, I scrape 2,500 tweets every 15 minutes; however, I observe that each run after 15 minutes returns the same tweets. Is there any way to skip the previously scraped tweets using some kind of offset?
Thank You!
Here is my code:
# Import libraries
from tweepy import OAuthHandler
#from tweepy.streaming import StreamListener
import tweepy
import csv
import pandas as pd
#import re
#from textblob import TextBlob
#import string
#import preprocessor as p
#import os
import time

# Twitter credentials
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

# Pass your twitter credentials to tweepy via its OAuthHandler
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

def extract_tweets(search_words, date_since, numTweets):
    return tweepy.Cursor(api.search, q=search_words, lang="en", since=date_since, tweet_mode='extended').items(numTweets)

def scrapetweets(search_words, date_since, numTweets, numRuns):
    # Define a pandas dataframe to store the data:
    db_tweets = pd.DataFrame(columns=['username', 'acctdesc', 'location', 'following', 'followers', 'totaltweets',
                                      'usercreatedts', 'tweetcreatedts', 'retweetcount', 'text', 'hashtags'])
    #db_tweets = pd.DataFrame()
    for i in range(numRuns):
        tweets = extract_tweets(search_words, date_since, numTweets)
        # Store these tweets into a python list
        tweet_list = [tweet for tweet in tweets]
        print(len(tweet_list))
        noTweets = 0
        for tweet in tweet_list:
            username = tweet.user.screen_name
            acctdesc = tweet.user.description
            location = tweet.user.location
            following = tweet.user.friends_count
            followers = tweet.user.followers_count
            totaltweets = tweet.user.statuses_count
            usercreatedts = tweet.user.created_at
            tweetcreatedts = tweet.created_at
            retweetcount = tweet.retweet_count
            hashtags = tweet.entities['hashtags']
            lst = []
            for h in hashtags:
                lst.append(h['text'])
            try:
                text = tweet.retweeted_status.full_text
            except AttributeError:  # Not a Retweet
                text = tweet.full_text
            itweet = [username, acctdesc, location, following, followers, totaltweets,
                      usercreatedts, tweetcreatedts, retweetcount, text, lst]
            db_tweets.loc[len(db_tweets)] = itweet
            noTweets += 1
            print(noTweets, itweet)
            #filename = "tweets.csv"
            #with open(filename, "a", newline='') as fp:
            #    wr = csv.writer(fp, dialect='excel')
            #    wr.writerow(itweet)
        print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        if i + 1 != numRuns:
            time.sleep(920)
    filename = "tweets.csv"
    # Store dataframe in csv with creation date timestamp
    db_tweets.to_csv(filename, mode='a', index=False)

# Initialise these variables:
search_words = "#India OR #COVID-19"
date_since = "2020-04-29"
#date_until = "2020-05-01"
numTweets = 2500
numRuns = 10

# Call the function scrapetweets
program_start = time.time()
scrapetweets(search_words, date_since, numTweets, numRuns)
program_end = time.time()

print('Scraping has completed!')
print('Total time taken to scrape is {} minutes.'.format(round((program_end - program_start) / 60, 2)))
I referred to a blog on Medium for this purpose.
You can add a variable as a validator and store it in a file, say tweetid.txt.
Each time you run the script, open tweetid.txt;
if a tweet's id is already in that file, skip the tweet.
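A minimal sketch of that idea, assuming the tweet_list built in the question's code above; the file name tweetid.txt is just the placeholder suggested in this answer:

seen_file = "tweetid.txt"

# load the ids that were written on previous runs
try:
    with open(seen_file) as f:
        seen_ids = set(line.strip() for line in f if line.strip())
except FileNotFoundError:
    seen_ids = set()

new_tweets = []
for tweet in tweet_list:
    if tweet.id_str in seen_ids:      # already scraped on an earlier run -> skip it
        continue
    new_tweets.append(tweet)
    seen_ids.add(tweet.id_str)

# remember every id seen so far for the next run
with open(seen_file, "w") as f:
    f.write("\n".join(seen_ids))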

Run python script every hour

I want to schedule my Python script to run every hour and save the data to an Elasticsearch index. I wrote a function, set_interval, which uses the tweepy library, but it doesn't work the way I need it to: it runs every minute and saves the data to the index. Even after setting the interval to 3600 seconds it still runs every minute, but I want it to run on an hourly basis.
How can I fix this? Here's my Python script:
def call_at_interval(time, callback, args):
    while True:
        timer = Timer(time, callback, args=args)
        timer.start()
        timer.join()

def set_interval(time, callback, *args):
    Thread(target=call_at_interval, args=(time, callback, args)).start()

def get_all_tweets(screen_name):
    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    screen_name = ""

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        #print
        #"getting tweets before %s" % (oldest)

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        #print
        #"...%s tweets downloaded so far" % (len(alltweets))

    outtweets = [{'ID': tweet.id_str, 'Text': tweet.text, 'Date': tweet.created_at, 'author': tweet.user.screen_name} for tweet in alltweets]

    def save_es(outtweets, es):  # Peps8 convention
        data = [  # Please without s in data
            {
                "_index": "index name",
                "_type": "type name",
                "_id": index,
                "_source": ID
            }
            for index, ID in enumerate(outtweets)
        ]
        helpers.bulk(es, data)

    save_es(outtweets, es)

    print('Run at:')
    print(datetime.now())
    print("\n")

set_interval(3600, get_all_tweets(screen_name))
Why do you need so much complexity just to run a task every hour? You can run the script every hour as shown below; note that each cycle actually takes one hour plus the time the work itself needs:
import time

def do_some_work():
    print("Do some work")
    time.sleep(1)
    print("Some work is done!")

if __name__ == "__main__":
    time.sleep(60)  # imagine you would like to start work in 1 minute first time
    while True:
        do_some_work()
        time.sleep(3600)  # do work every one hour
If you want to run the script exactly every hour, use the code below:
import time
import threading

def do_some_work():
    print("Do some work")
    time.sleep(4)
    print("Some work is done!")

if __name__ == "__main__":
    time.sleep(60)  # imagine you would like to start work in 1 minute first time
    while True:
        thr = threading.Thread(target=do_some_work)
        thr.start()
        time.sleep(3600)  # do work every one hour
In this case thr is expected to finish its work in under 3600 seconds. If it does not, you will still get results, but they will come from a different attempt; see the example below:
import time
import threading

class AttemptCount:
    def __init__(self, attempt_number):
        self.attempt_number = attempt_number

def do_some_work(_attempt_number):
    print(f"Do some work {_attempt_number.attempt_number}")
    time.sleep(4)
    print(f"Some work is done! {_attempt_number.attempt_number}")
    _attempt_number.attempt_number += 1

if __name__ == "__main__":
    attempt_number = AttemptCount(1)
    time.sleep(1)  # imagine you would like to start work in 1 minute first time
    while True:
        thr = threading.Thread(target=do_some_work, args=(attempt_number, ),)
        thr.start()
        time.sleep(1)  # do work every one hour
The output you will get in this case is:
Do some work 1
Do some work 1
Do some work 1
Do some work 1
Some work is done! 1
Do some work 2
Some work is done! 2
Do some work 3
Some work is done! 3
Do some work 4
Some work is done! 4
Do some work 5
Some work is done! 5
Do some work 6
Some work is done! 6
Do some work 7
Some work is done! 7
Do some work 8
Some work is done! 8
Do some work 9
I like using subprocess.Popen for such tasks: if the child process does not finish its work within one hour for any reason, you just terminate it and start a new one.
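A rough sketch of that pattern, assuming the tweet-fetching logic lives in a separate script called fetch_tweets.py (a hypothetical file name, not from the question):

import subprocess
import time

while True:
    start = time.time()
    # start the worker script as a child process
    proc = subprocess.Popen(["python", "fetch_tweets.py"])
    try:
        # give it at most one hour to finish its work
        proc.wait(timeout=3600)
    except subprocess.TimeoutExpired:
        # it did not finish within the hour: terminate it and move on
        proc.terminate()
        proc.wait()
    # sleep out whatever is left of the hour before the next run
    remaining = 3600 - (time.time() - start)
    if remaining > 0:
        time.sleep(remaining)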
You can also use cron to schedule a process to run every hour.
Get rid of all the timer code and just keep the logic; cron will do the scheduling for you. Add this line to the end of the file opened by crontab -e:
0 * * * * /path/to/python /path/to/script.py
0 * * * * means run at every zero minute; you can find more explanation here.
I also noticed you are calling get_all_tweets(screen_name) immediately inside the argument to set_interval; I think you should just call it from outside instead.
Just keep this much of your script:
def get_all_tweets(screen_name):
    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    screen_name = ""

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        #print
        #"getting tweets before %s" % (oldest)

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        #print
        #"...%s tweets downloaded so far" % (len(alltweets))

    outtweets = [{'ID': tweet.id_str, 'Text': tweet.text, 'Date': tweet.created_at, 'author': tweet.user.screen_name} for tweet in alltweets]

    def save_es(outtweets, es):  # Peps8 convention
        data = [  # Please without s in data
            {
                "_index": "index name",
                "_type": "type name",
                "_id": index,
                "_source": ID
            }
            for index, ID in enumerate(outtweets)
        ]
        helpers.bulk(es, data)

    save_es(outtweets, es)

get_all_tweets("")  # your screen name here

How can I crawl (scrape) Twitter with several keywords in Python?

I wrote the code below, but I don't think it's working as intended.
I want to extract tweets matching either keyword ("or"), not both keywords at once ("and").
It seems like only 'keyword1' is being extracted.
How do I fix this?
import tweepy
import time
import os

search_term = 'keyword1'
search_term2 = 'keyword2'
lat = "37.6"
lon = "127.0"
radius = "200km"
location = "%s,%s,%s" % (lat, lon, radius)

API_key = "11111"
API_secret = "22222"
Access_token = "33333"
Access_token_secret = "444"

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

c = tweepy.Cursor(api.search,
                  q=(search_term or search_term2),
                  rpp=1000,
                  geocode=location,
                  include_entities=True)

data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    i += 1
    time.sleep(1)

wfile = open(os.getcwd() + "/twtw2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
    data['text'] = tweet.text
    wfile.write(data['text'] + '\n')
    i += 1
    time.sleep(1)
wfile.close()
Maybe change this line
q=(search_term or search_term2),
to
q="{}+OR+{}".format(search_term, search_term2),
Case matters here for the OR operator. Pass q as a string, not as a Python expression: (search_term or search_term2) is short-circuit evaluated and simply yields search_term, which is why only keyword1 was being searched.
By the way, your credentials (from your post) also work for me.
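The same idea extends to more than two keywords (as in the five-term query at the top of this page). A small sketch, assuming the api and location objects from the question above:

search_terms = ['keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5']

# join every term with the OR operator so the query matches any of them
query = "+OR+".join(search_terms)

c = tweepy.Cursor(api.search,
                  q=query,
                  geocode=location,
                  include_entities=True)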

How can I save the crawling (scraping, streaming) result?

The crawling (scraping, streaming) result looks very good,
e.g. 973 : {'text': 'RT #1111: hihihihihihi' }
But I am unable to save it.
How do I fix this?
import tweepy
import time
import os
import json
import simplejson

search_term = '5555'
search_term2 = '4444'
search_term3 = '3333'
search_term4 = '2222'
search_term5 = '1111'
lat = "11.11"
lon = "11.11"
radius = "100km"
API_key = "0"
API_secret = "0"
Access_token = "0"
Access_token_secret = "0"
location = "%s,%s,%s" % (lat, lon, radius)

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

c = tweepy.Cursor(api.search,
                  q="{}+OR+{}".format(search_term, search_term2, search_term3, search_term4, search_term5),
                  rpp=1000,
                  geocode=location,
                  include_entities=True)

data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    time.sleep(0.4)
    i += 1
Either the txt file is not created at all (with no error message), or the txt file is created but contains no tweet text or tweet date (again with no error message).
(It does not have to be a txt file; saving to an Excel file would also be fine.)
wfile = open(os.getcwd() + "/tqtq.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
    data['text'] = tweet.text
    data['date'] = tweet.text
    wfile.write(data['text','date'] + '\n')
    i += 1
    time.sleep(0.4)
wfile.close()
You may try using pickle. Note that pickle.dump and pickle.load take an open file object, not a file name:
import pickle

with open(filename, 'wb') as f:
    pickle.dump(obj, f)
To load it back:
with open(filename, 'rb') as f:
    result = pickle.load(f)
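As a side note on the save loop in the question: data['text','date'] looks up a single tuple key ('text', 'date') that was never set, so the write never produces the text and date. A minimal corrected sketch, reusing the cursor c and imports from the question, and assuming the date should come from tweet.created_at (the question assigned tweet.text to both fields):

# write one line per tweet: creation date, a tab, then the text
with open(os.getcwd() + "/tqtq.txt", mode='w', encoding='utf-8') as wfile:
    for tweet in c.items():
        line = "{}\t{}".format(tweet.created_at, tweet.text)
        wfile.write(line + '\n')
        time.sleep(0.4)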

Twython Rate Limit Issue

I am wondering how I can automate my program to fetch tweets at the maximum rate of 180 requests per 15 minutes, which at the maximum count of 100 tweets per request totals 18,000 tweets. I am creating this program for an independent case study at school.
I would like my program to avoid being rate limited and terminated. So what I would like it to do is constantly use the maximum number of requests per 15 minutes and be able to run for 24 hours without user interaction, to retrieve as many tweets as possible for analysis.
Here is my code. It fetches tweets for a query and writes them to a text file, but it eventually gets rate limited. I would really appreciate the help.
import logging
import time
import csv
import twython
import json

app_key = ""
app_secret = ""
oauth_token = ""
oauth_token_secret = ""

twitter = twython.Twython(app_key, app_secret, oauth_token, oauth_token_secret)

tweets = []
MAX_ATTEMPTS = 1000000
# Max Number of tweets per 15 minutes
COUNT_OF_TWEETS_TO_BE_FETCHED = 18000

for i in range(0, MAX_ATTEMPTS):
    if COUNT_OF_TWEETS_TO_BE_FETCHED < len(tweets):
        break
    if 0 == i:
        results = twitter.search(q="$AAPL", count='100', lang='en',)
    else:
        results = twitter.search(q="$AAPL", include_entities='true', max_id=next_max_id)
    for result in results['statuses']:
        print result
        with open('tweets.txt', 'a') as outfile:
            json.dump(result, outfile, sort_keys=True, indent=4)
    try:
        next_results_url_params = results['search_metadata']['next_results']
        next_max_id = next_results_url_params.split('max_id=')[1].split('&')[0]
    except:
        break
You should be using Twitter's Streaming API.
This will allow you to receive a near-realtime feed of your search. You can write those tweets to a file just as fast as they come in.
Using the track parameter you will be able to receive only the specific tweets you're interested in.
You'll need to use Twython's TwythonStreamer, and your code will look something like this:
from twython import TwythonStreamer

class MyStreamer(TwythonStreamer):
    def on_success(self, data):
        if 'text' in data:
            print data['text'].encode('utf-8')

    def on_error(self, status_code, data):
        print status_code

stream = MyStreamer(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
stream.statuses.filter(track='$AAPL')
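To also write those tweets to a file as fast as they come in, the on_success handler can append each tweet to disk. A small sketch along the same lines (the file name tweets.txt is taken from the question, and the credentials are the same placeholders as above):

import json
from twython import TwythonStreamer

class FileStreamer(TwythonStreamer):
    def on_success(self, data):
        if 'text' in data:
            # append each tweet as one JSON line to the output file
            with open('tweets.txt', 'a') as outfile:
                json.dump(data, outfile)
                outfile.write('\n')

    def on_error(self, status_code, data):
        # stop streaming on errors such as 420 (rate limited)
        self.disconnect()

stream = FileStreamer(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
stream.statuses.filter(track='$AAPL')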
