How can I crawl (scrape) Twitter for several keywords in Python?

I wrote the code below, but it doesn't work the way I want.
I want to match tweets containing keyword1 OR keyword2, not keyword1 AND keyword2.
It seems like only 'keyword1' is matched.
How do I fix it?
import tweepy
import time
import os

search_term = 'keyword1'
search_term2 = 'keyword2'
lat = "37.6"
lon = "127.0"
radius = "200km"
location = "%s,%s,%s" % (lat, lon, radius)

API_key = "11111"
API_secret = "22222"
Access_token = "33333"
Access_token_secret = "444"

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

c = tweepy.Cursor(api.search,
                  q=(search_term or search_term2),
                  rpp=1000,
                  geocode=location,
                  include_entities=True)

data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    i += 1
    time.sleep(1)

wfile = open(os.getcwd() + "/twtw2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
    data['text'] = tweet.text
    wfile.write(data['text'] + '\n')
    i += 1
    time.sleep(1)
wfile.close()

Maybe change this line
q=(search_term or search_term2),
to
q="{}+OR+{}".format(search_term, search_term2),
Case matters here for the OR operator. Enter q as a string, not as an expression that is short-circuit evaluated: (search_term or search_term2) is a Python boolean expression that simply evaluates to 'keyword1', which is why only the first keyword was searched.
By the way, your credentials (from your post) also work for me, so you should revoke and regenerate them.
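For reference, a minimal sketch of the corrected cursor, reusing the variables from the question:

c = tweepy.Cursor(api.search,
                  q="{}+OR+{}".format(search_term, search_term2),
                  geocode=location,
                  include_entities=True)
for tweet in c.items(10):  # print the first 10 tweets matching keyword1 OR keyword2
    print(tweet.text)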

Related

Tweepy search_full_archive() missing 2 required positional arguments: 'label' and 'query'

I'm using Tweepy 3.10.0 to collect tweets containing specific keywords and hashtags for a single calendar day at a time. I recently upgraded from the standard Developer Account to the Premium Account to access the full archive. I know this changes the "search" function to "search_full_archive" and changes a couple of other small syntax things. I thought I made the correct changes, but I'm still getting this error. I've checked the Developer API reference.
consumer_key = '****'
consumer_secret = '****'
access_token = '****'
access_token_secret = '****'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

def get_tweets_withHashTags(query, startdate, enddate, count=300):
    tweets_hlist = []
    tweets_list = []
    qt = str(query)
    for page in tweepy.Cursor(api.search_full_archive, environment_name='FullArchive',
                              q=qt, fromDate=startdate, toDate=enddate,
                              count=300, tweet_mode='extended').pages(100):
        count = len(page)
        print("Count of tweets in each page for " + str(qt) + " : " + str(count))
        for value in page:
            hashList = value._json["entities"]["hashtags"]
            flag = 0
            for tag in hashList:
                if qt.lower() in tag["text"].lower():
                    flag = 1
            if flag == 1:
                tweets_hlist.append(value._json)
            tweets_list.append(value._json)
    print("tweets_hash_" + query + ": " + str(len(tweets_hlist)))
    print("tweets_" + query + ": " + str(len(tweets_list)))
    with open("/Users/Victor/Documents/tweetCollection/data/" + startdate + "/" + "query1_hash_" + str(startdate) + "_" + str(enddate) + "_" + query + '.json', 'w') as outfile:
        json.dump(tweets_hlist, outfile, indent=2)
    with open("/Users/Victor/Documents/tweetCollection/data/" + startdate + "/" + "query1_Contains_" + str(startdate) + "_" + str(enddate) + "_" + query + '.json', 'w') as outfile:
        json.dump(tweets_list, outfile, indent=2)
    return len(tweets_list)

query = ["KeyWord1", "KeyWord2", "KeyWord3"]  # etc.
for value in query:
    get_tweets_withHashTags(value, "2020-04-21", "2020-04-22")
According to the API's code https://github.com/tweepy/tweepy/blob/5b2dd086c2c5a08c3bf7be54400adfd823d19ea1/tweepy/api.py#L1144, api.search_full_archive takes label (the environment name) and query as its arguments. So change
api.search_full_archive, environment_name='FullArchive', q=qt, fromDate=startdate, toDate=enddate, count=300, tweet_mode='extended'
to
api.search_full_archive, label='FullArchive', query=qt, fromDate=startdate, toDate=enddate
As for tweet_mode='extended', it is not available for search_full_archive or search_30_day. You can see how to access the full text in https://github.com/tweepy/tweepy/issues/1461
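Putting that into the question's loop, a corrected cursor would look roughly like this (a sketch based on the answer above; 'FullArchive' must match the dev environment label configured on your Twitter developer dashboard):

for page in tweepy.Cursor(api.search_full_archive,
                          label='FullArchive',  # the environment name goes in 'label'
                          query=qt,             # the search string goes in 'query', not 'q'
                          fromDate=startdate,
                          toDate=enddate).pages(100):
    print("Count of tweets in this page:", len(page))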

Why does my Python code not retrieve Arabic tweets even though it works with other languages? Why does it return random characters?

My code is in Python 3, and I have used it before to live-stream tweets in English. However, when searching for an Arabic query, the same code returns all tweets as symbols and random characters. Here is a screenshot, and my code. (PS: I am a beginner in coding. Thank you!)
import twitter, json, csv

CONSUMER_KEY = '<consumer key>'
CONSUMER_SECRET = '<consumer secret>'
OAUTH_TOKEN = '<oauth token>'
OAUTH_TOKEN_SECRET = '<oauth token secret>'

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)

# set up a file to write to
csvfile = open('tweets_extended.csv', 'w')
csvwriter = csv.writer(csvfile, delimiter='|')

# here's a function that takes out characters that can break
# our import into Excel and replaces them with spaces;
# it also does the unicode bit
def getVal(val):
    clean = ""
    if val:
        val = val.replace('|', ' ')
        val = val.replace('\n', ' ')
        val = val.replace('\r', ' ')
        clean = val.encode('utf-8')
    return clean

q = "سلمان"  # Comma-separated list of terms can go here
print('Filtering the public timeline for track="%s"' % (q,))
twitter_stream = twitter.TwitterStream(auth=twitter_api.auth)
stream = twitter_stream.statuses.filter(track=q)

for tweet in stream:
    try:
        if tweet['truncated']:
            tweet_text = tweet['extended_tweet']['full_text']
        else:
            tweet_text = tweet['text']
        # write the values to file
        csvwriter.writerow([
            tweet['created_at'],
            getVal(tweet['user']['screen_name']),
            getVal(tweet_text),
            getVal(tweet['user']['location']),
            tweet['user']['statuses_count'],
            tweet['user']['followers_count'],
            tweet['user']['lang'],
            tweet['user']['id'],
        ])
        # print something to the screen, mostly so we can see what is going on...
        print(tweet['user']['screen_name'].encode('utf-8'), tweet['text'].encode('utf-8'))
    except Exception as err:
        print(err)
        pass
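A likely cause, for what it's worth (my note, assuming Python 3 as the post states): getVal returns val.encode('utf-8'), which is a bytes object, so csv.writer writes its repr, e.g. b'\xd8\xb3\xd9\x84...'. English text looks almost normal inside b'...', but Arabic turns into hex escapes, i.e. the "random characters". A minimal sketch of a fix is to keep strings as str and give the file an explicit encoding:

# open the file with an explicit encoding; newline='' is the csv module's recommendation
csvfile = open('tweets_extended.csv', 'w', encoding='utf-8', newline='')
csvwriter = csv.writer(csvfile, delimiter='|')

def getVal(val):
    clean = ""
    if val:
        val = val.replace('|', ' ').replace('\n', ' ').replace('\r', ' ')
        clean = val  # keep it as str; encoding to bytes is what produced b'...' in the output
    return clean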

Tweepy still not returning full text despite using extended text feature

I am using tweepy to download tweets about a particular topic, but no matter which tutorial I follow, I cannot get the tweet to output as full text. There is always an ellipsis that cuts it off after a certain number of characters.
Here is the code I am using
import json
import tweepy
from tweepy import OAuthHandler
import csv
import sys
from twython import Twython

nonBmpMap = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

with open('Twitter_Credentials.json') as cred_data:
    info = json.load(cred_data)
consumer_Key = info['Consumer_Key']
consumer_Secret = info['Consumer_Secret']
access_Key = info['Access_Key']
access_Secret = info['Access_Secret']

maxTweets = int(input('Enter the Number of tweets that you want to extract '))
userTopic = input('What topic do you want to search for ')
topic = ('"' + userTopic + '"')
tweetCount = 0

auth = OAuthHandler(consumer_Key, consumer_Secret)
auth.set_access_token(access_Key, access_Secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

tweets = api.search(q=topic, count=maxTweets, tweet_mode='extended')
for tweet in tweets:
    tweetCount = tweetCount + 1
    with open('TweetsAbout' + userTopic, 'a', encoding='utf-8') as the_File:
        print(tweet.full_text.translate(nonBmpMap))
        tweet = (str(tweet.full_text).translate(nonBmpMap).replace(',', '').replace('|', '').replace('\n', '').replace('’', '\'').replace('…', "end"))
        the_File.write(tweet + "\n")

print('Extracted ' + str(tweetCount) + ' tweets about ' + topic)
Try this, see if it works!
try:
    specific_tweets = tweepy.Cursor(api.search, tweet_mode='extended',
                                    q=<your_query_string> + " -filter:retweets",
                                    lang='en').items(500)
except tweepy.error.TweepError:
    pass
for tweet in specific_tweets:
    extracted_text = tweet.full_text
All the text you're trying to extract should be in extracted_text. Good luck!!
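A side note on the -filter:retweets part (my addition, not the answerer's): for retweets, tweet.full_text is still truncated because the full text lives on the wrapped original status, so filtering them out avoids the ellipsis. If you do want retweets, a common pattern is:

for tweet in specific_tweets:
    if hasattr(tweet, 'retweeted_status'):
        extracted_text = tweet.retweeted_status.full_text  # full text of the original tweet
    else:
        extracted_text = tweet.full_text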

Twitter scraping: how to cumulatively re-execute on the current result

I'm scraping Twitter now (in Python).
It is a crawler that extracts tweets matching popular keywords in real time.
If there are no tweets with the keywords, it terminates.
(It has been successful so far.)
I want the code to re-execute automatically after it finishes, instead of me restarting it by hand.
(Cumulative re-execution on the current result.)
My first thought was to make the crawl repeat every 12 hours.
But I cannot get it to run; I think I have to make the code I created stop after 12 hours.
(Repeat every 12 hours: run for 12 hours, pause, and run again.)
I also suspect the result is not cumulative; it just collects the same tweets again from the beginning.
I am posting this to get advice on my code or on my approach.
import tweepy
import time
import os
import json
import simplejson

API_key = "x"
API_secret = "x"
Access_token = "x"
Access_token_secret = "x"

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

search_term = 'x'
search_term2 = 'x'
search_term3 = 'x'
search_term4 = 'x'
search_term5 = 'x'
lat = "x"
lon = "x"
radius = "x"
location = "%s,%s,%s" % (lat, lon, radius)

# join all five terms; the original "{}+OR+{}".format(...) had only two
# placeholders, so terms 3-5 were silently dropped
c = tweepy.Cursor(api.search,
                  q=" OR ".join([search_term, search_term2, search_term3,
                                 search_term4, search_term5]),
                  rpp=1000,
                  geocode=location,
                  include_entities=True)

data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    i += 1
    time.sleep(0.35)

wfile = open(os.getcwd() + "/wtt2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
    data['text'] = tweet.text
    wfile.write(data['text'] + '\n')
    i += 1
wfile.close()
from apscheduler.schedulers.blocking import BlockingScheduler

sched = BlockingScheduler()

@sched.scheduled_job('interval', hours=12)  # decorator, not a comment
def timed_job():
    print('This job is run every 12 hours.')

# sched.configure(options_from_ini_file)  # 'options_from_ini_file' is not defined anywhere in the post
sched.start()
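To make the 12-hour runs cumulative instead of re-collecting duplicates from the beginning, one option (a sketch of my suggestion, not code from the post) is to replace the timed_job stub with a job that remembers the highest tweet id seen and passes it as since_id on the next run:

latest_id = None  # highest tweet id seen so far, shared across runs

@sched.scheduled_job('interval', hours=12)
def crawl_job():
    global latest_id
    kwargs = {'geocode': location, 'include_entities': True}
    if latest_id is not None:
        kwargs['since_id'] = latest_id  # only fetch tweets newer than the last run
    terms = " OR ".join([search_term, search_term2, search_term3,
                         search_term4, search_term5])
    with open(os.getcwd() + "/wtt2.txt", mode='a') as wfile:  # 'a' appends across runs
        for tweet in tweepy.Cursor(api.search, q=terms, **kwargs).items():
            wfile.write(tweet.text + '\n')
            latest_id = max(latest_id or 0, tweet.id)

sched.start()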

How can I save the crawling (scraping, streaming) result?

The crawling (scraping, streaming) result is very good, e.g.:
973 : {'text': 'RT #1111: hihihihihihi' }
But I am unable to save it.
How do I fix it?
import tweepy
import time
import os
import json
import simplejson

search_term = '5555'
search_term2 = '4444'
search_term3 = '3333'
search_term4 = '2222'
search_term5 = '1111'
lat = "11.11"
lon = "11.11"
radius = "100km"
API_key = "0"
API_secret = "0"
Access_token = "0"
Access_token_secret = "0"
location = "%s,%s,%s" % (lat, lon, radius)

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

# join all five terms; a two-placeholder format string would silently drop terms 3-5
c = tweepy.Cursor(api.search,
                  q=" OR ".join([search_term, search_term2, search_term3,
                                 search_term4, search_term5]),
                  rpp=1000,
                  geocode=location,
                  include_entities=True)

data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    time.sleep(0.4)
    i += 1
Either the txt file is not created (there is no error message), or the txt file is created but there is no tweet text and tweet date in it (again, no error message).
(It does not have to be a txt file; saving to an Excel file would also be fine.)
wfile = open(os.getcwd() + "/tqtq.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
    data['text'] = tweet.text
    data['date'] = tweet.text
    wfile.write(data['text','date'] + '\n')
    i += 1
    time.sleep(0.4)
wfile.close()
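Before reaching for pickle, note two bugs in the write loop (my observation): data['text','date'] looks up the tuple ('text', 'date') as a single dictionary key and raises KeyError, and the date field is assigned tweet.text instead of tweet.created_at. A corrected sketch:

wfile = open(os.getcwd() + "/tqtq.txt", mode='w')
for tweet in c.items():
    # write "text <TAB> date" per line; created_at is a datetime object
    wfile.write(tweet.text + '\t' + str(tweet.created_at) + '\n')
    time.sleep(0.4)
wfile.close()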
You may try using pickle. Note that pickle.dump and pickle.load take a file object opened in binary mode, not a filename:
import pickle
with open(filename, 'wb') as f:
    pickle.dump(obj, f)
To load it back:
with open(filename, 'rb') as f:
    result = pickle.load(f)
