How can I save crawling (scraping, streaming) results? - Python

The crawling (scraping, streaming) results look fine when printed, e.g.:
973 : {'text': 'RT #1111: hihihihihihi' }
But I am unable to save them. How do I fix it?
import tweepy
import time
import os
import json
import simplejson

search_term = '5555'
search_term2 = '4444'
search_term3 = '3333'
search_term4 = '2222'
search_term5 = '1111'
lat = "11.11"
lon = "11.11"
radius = "100km"
API_key = "0"
API_secret = "0"
Access_token = "0"
Access_token_secret = "0"
location = "%s,%s,%s" % (lat, lon, radius)

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

# note: five search terms need five placeholders, not two
c = tweepy.Cursor(api.search,
                  q="{}+OR+{}+OR+{}+OR+{}+OR+{}".format(search_term, search_term2,
                                                        search_term3, search_term4,
                                                        search_term5),
                  rpp=1000,
                  geocode=location,
                  include_entities=True)

data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    time.sleep(0.4)
    i += 1
Either the txt file is not created at all (with no error message), or the file is created but contains no tweet text and tweet date (again with no error message).
(It does not have to be a txt file; saving to an Excel file would also be fine.)
wfile = open(os.getcwd()+"/tqtq.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
    data['text'] = tweet.text
    data['date'] = tweet.text
    wfile.write(data['text','date']+'\n')
    i += 1
    time.sleep(0.4)
wfile.close()

You may try using pickle. Note that pickle.dump and pickle.load take a file object opened in binary mode, not a file name:

import pickle

with open('tweets.pkl', 'wb') as f:
    pickle.dump(obj, f)

To load it back:

with open('tweets.pkl', 'rb') as f:
    result = pickle.load(f)
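For the original goal of saving tweet text and date, three things go wrong in the second snippet: the cursor was already exhausted by the first print loop, data['date'] is assigned tweet.text instead of tweet.created_at, and data['text','date'] looks up a nonexistent tuple key. A minimal sketch of a working version, assuming the api, location, and search terms defined above (the csv module and the fresh Cursor call are my additions, not from the original post):

import csv
import tweepy

query = "{}+OR+{}+OR+{}+OR+{}+OR+{}".format(search_term, search_term2,
                                            search_term3, search_term4,
                                            search_term5)

with open("tqtq.csv", mode="w", newline="", encoding="utf-8") as wfile:
    writer = csv.writer(wfile)
    writer.writerow(["date", "text"])
    # build a fresh cursor: an already-iterated cursor yields nothing
    for tweet in tweepy.Cursor(api.search, q=query, geocode=location,
                               include_entities=True).items():
        writer.writerow([tweet.created_at, tweet.text])

Since the question mentions Excel, a CSV opens directly in Excel; pandas.DataFrame.to_excel would be another option.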

Related

Why does my Python code not retrieve Arabic tweets when it works with other languages? Why does it return random characters?

My code is in Python 3, and I have used it before to live-stream tweets in English. However, when the same code searches for an Arabic query, it returns all tweets as symbols and random characters; here is a screenshot, and the code. (PS: I am a beginner in coding. Thank you!) Here is my code:
import twitter, json, csv

CONSUMER_KEY = '<consumer key>'
CONSUMER_SECRET = '<consumer secret>'
OAUTH_TOKEN = '<oauth token>'
OAUTH_TOKEN_SECRET = '<oauth token secret>'

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)

# setup a file to write to
csvfile = open('tweets_extended.csv', 'w')
csvwriter = csv.writer(csvfile, delimiter='|')

# here's a function that takes out characters that can break
# our import into Excel and replaces them with spaces
# it also does the unicode bit
def getVal(val):
    clean = ""
    if val:
        val = val.replace('|', ' ')
        val = val.replace('\n', ' ')
        val = val.replace('\r', ' ')
        clean = val.encode('utf-8')
    return clean

q = "سلمان"  # Comma-separated list of terms can go here
print('Filtering the public timeline for track="%s"' % (q,))
twitter_stream = twitter.TwitterStream(auth=twitter_api.auth)
stream = twitter_stream.statuses.filter(track=q)

for tweet in stream:
    try:
        if tweet['truncated']:
            tweet_text = tweet['extended_tweet']['full_text']
        else:
            tweet_text = tweet['text']
        # write the values to file
        csvwriter.writerow([
            tweet['created_at'],
            getVal(tweet['user']['screen_name']),
            getVal(tweet_text),
            getVal(tweet['user']['location']),
            tweet['user']['statuses_count'],
            tweet['user']['followers_count'],
            tweet['user']['lang'],
            tweet['user']['id'],
        ])
        # print something to the screen, mostly so we can see what is going on...
        print(tweet['user']['screen_name'].encode('utf-8'), tweet['text'].encode('utf-8'))
    except Exception as err:
        print(err)
        pass
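No answer was posted here, but a likely cause (my diagnosis, not from the original thread) is that getVal returns bytes (val.encode('utf-8')), and in Python 3 the csv module then writes the b'...' repr of those bytes rather than the decoded text, which shows up as random symbols. A minimal sketch that returns str and lets the file object handle encoding:

import csv

def getVal(val):
    # return a cleaned str; the file object does the encoding
    if not val:
        return ""
    return val.replace('|', ' ').replace('\n', ' ').replace('\r', ' ')

# 'utf-8-sig' writes a BOM so Excel detects the encoding on import
csvfile = open('tweets_extended.csv', 'w', encoding='utf-8-sig', newline='')
csvwriter = csv.writer(csvfile, delimiter='|')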

Tweepy returns same tweets when scraping data repeatedly

I am scraping tweet data from Twitter. Since Twitter limits requests, I scrape 2500 tweets every 15 minutes; however, each run after those 15 minutes returns the same tweets. Is there a way to skip the previously scraped tweets using some kind of offset?
Thank you!
Here is my code:
# Import libraries
from tweepy import OAuthHandler
#from tweepy.streaming import StreamListener
import tweepy
import csv
import pandas as pd
#from textblob import TextBlob
#import preprocessor as p
import time

# Twitter credentials
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

# Pass your twitter credentials to tweepy via its OAuthHandler
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

def extract_tweets(search_words, date_since, numTweets):
    return tweepy.Cursor(api.search, q=search_words, lang="en",
                         since=date_since, tweet_mode='extended').items(numTweets)

def scrapetweets(search_words, date_since, numTweets, numRuns):
    # Define a pandas dataframe to store the data:
    db_tweets = pd.DataFrame(columns=['username', 'acctdesc', 'location', 'following',
                                      'followers', 'totaltweets', 'usercreatedts',
                                      'tweetcreatedts', 'retweetcount', 'text', 'hashtags'])
    for i in range(numRuns):
        tweets = extract_tweets(search_words, date_since, numTweets)
        # Store these tweets into a python list
        tweet_list = [tweet for tweet in tweets]
        print(len(tweet_list))
        noTweets = 0
        for tweet in tweet_list:
            username = tweet.user.screen_name
            acctdesc = tweet.user.description
            location = tweet.user.location
            following = tweet.user.friends_count
            followers = tweet.user.followers_count
            totaltweets = tweet.user.statuses_count
            usercreatedts = tweet.user.created_at
            tweetcreatedts = tweet.created_at
            retweetcount = tweet.retweet_count
            hashtags = tweet.entities['hashtags']
            lst = []
            for h in hashtags:
                lst.append(h['text'])
            try:
                text = tweet.retweeted_status.full_text
            except AttributeError:  # Not a Retweet
                text = tweet.full_text
            itweet = [username, acctdesc, location, following, followers, totaltweets,
                      usercreatedts, tweetcreatedts, retweetcount, text, lst]
            db_tweets.loc[len(db_tweets)] = itweet
            noTweets += 1
            print(noTweets, itweet)
        print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        if i + 1 != numRuns:
            time.sleep(920)
    filename = "tweets.csv"
    # Store dataframe in csv
    db_tweets.to_csv(filename, mode='a', index=False)

# Initialise these variables:
search_words = "#India OR #COVID-19"
date_since = "2020-04-29"
#date_until = "2020-05-01"
numTweets = 2500
numRuns = 10

# Call the function scrapetweets
program_start = time.time()
scrapetweets(search_words, date_since, numTweets, numRuns)
program_end = time.time()
print('Scraping has completed!')
print('Total time taken to scrape is {} minutes.'.format(round((program_end - program_start) / 60, 2)))
I referred to a blog post on Medium for this.
You can keep a validator variable and store it in a file, say tweetid.txt. Each time you run the script, open tweetid.txt, and if a tweet's ID is already in the file, skip that tweet; see the sketch below.
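A minimal sketch of that idea combined with the search API's since_id parameter, which asks Twitter to return only tweets newer than a given ID (the file name tweetid.txt is from the answer; the helper names are mine):

import os
import tweepy

ID_FILE = "tweetid.txt"

def load_last_id():
    # newest tweet ID seen so far, or None on the first run
    if not os.path.exists(ID_FILE):
        return None
    content = open(ID_FILE).read().strip()
    return int(content) if content else None

def save_last_id(tweet_id):
    with open(ID_FILE, "w") as f:
        f.write(str(tweet_id))

def fetch_new_tweets(api, search_words, numTweets):
    # assumes api is an authenticated tweepy.API as in the question
    tweets = list(tweepy.Cursor(api.search, q=search_words, lang="en",
                                tweet_mode='extended',
                                since_id=load_last_id()).items(numTweets))
    if tweets:
        # results arrive newest-first, so the first tweet has the highest ID
        save_last_id(tweets[0].id)
    return tweets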

Extract tweets by specifying latitude and longitude

I'm extracting Twitter data by specifying a longitude, latitude, and range. However, I want to extract tweets from a rectangular area specified by two coordinate pairs of latitude and longitude.
Code:
from twitter import *
import sys
import csv

latitude = 51.474144    # geographical centre of search
longitude = -0.035401   # geographical centre of search
max_range = 1           # search range in kilometres
num_results = 1000      # minimum results to obtain
outfile = "output.csv"

sys.path.append(".")
import config

consumer_key = '*************************'
consumer_secret = '*******************************'
access_key = '***************************************'
access_secret = '*****************************'

twitter = Twitter(auth=OAuth(access_key, access_secret, consumer_key,
                             consumer_secret))

csvfile = open(outfile, "w")
csvwriter = csv.writer(csvfile)
row = ["user", "text", "latitude", "longitude"]
csvwriter.writerow(row)

result_count = 0
last_id = None
while result_count < num_results:
    query = twitter.search.tweets(q="", geocode="%f,%f,%dkm" % (latitude, longitude, max_range),
                                  count=1000, max_id=last_id)
    for result in query["statuses"]:
        if result["geo"]:
            user = result["user"]["screen_name"]
            text = result["text"]
            text = text.encode('ascii', 'replace')
            latitude = result["geo"]["coordinates"][0]
            longitude = result["geo"]["coordinates"][1]
            row = [user, text, latitude, longitude]
            csvwriter.writerow(row)
            result_count += 1
    last_id = result["id"]
    print("got %d results" % result_count)

csvfile.close()
print("written to %s" % outfile)
Any help will be highly appreciated.
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import time
import json
import re
from urllib3.exceptions import ProtocolError

access_token = 'xxxx'
access_token_secret = 'xxxx'
consumer_key = 'xxxx'
consumer_secret = 'xxxx'

class StdOutListener(StreamListener):
    def on_data(self, data):
        print(data)
        return True

    def on_error(self, status):
        print('Encountered error with status code:', status)

if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter Streaming API
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)
    while True:
        try:
            # bounding box: SW longitude, SW latitude, NE longitude, NE latitude
            stream.filter(locations=[144.9385, -37.8246, 144.9761, -37.7955],
                          stall_warnings=True)
        except (ProtocolError, AttributeError):
            continue
If Twitter only allowed you to search within a circle of specified radius, the practical way to do this would be to compute the centre of your rectangle and use its half-diagonal as the radius, retrieve those tweets, and then exclude the ones that lie outside your chosen rectangle (which you can do by checking each axis individually); a sketch of that approach follows below.
Fortunately, Twitter also implements search by bounding box (see this documentation page).
Studying https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/geo-objects may help you in formulating your request.
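A minimal sketch of the circle-then-filter fallback described above (the helper names are illustrative; distances use a rough equirectangular approximation that is fine for small areas):

import math

def covering_circle(lat_sw, lon_sw, lat_ne, lon_ne):
    # centre of the rectangle and the radius (km) of a circle that covers it
    lat_c = (lat_sw + lat_ne) / 2.0
    lon_c = (lon_sw + lon_ne) / 2.0
    km_per_deg_lat = 111.32
    km_per_deg_lon = 111.32 * math.cos(math.radians(lat_c))
    half_height = abs(lat_ne - lat_sw) / 2.0 * km_per_deg_lat
    half_width = abs(lon_ne - lon_sw) / 2.0 * km_per_deg_lon
    return lat_c, lon_c, math.hypot(half_height, half_width)

def in_rectangle(lat, lon, lat_sw, lon_sw, lat_ne, lon_ne):
    # verify each axis individually, as suggested above
    return lat_sw <= lat <= lat_ne and lon_sw <= lon <= lon_ne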

Tweepy still not returning full text despite using extended text feature

I am using tweepy to download tweets about a particular topic, but no matter which tutorial I follow, I cannot get the tweets to output as full text. There is always an ellipsis that cuts the text off after a certain number of characters.
Here is the code I am using
import json
import tweepy
from tweepy import OAuthHandler
import csv
import sys
from twython import Twython

nonBmpMap = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

with open('Twitter_Credentials.json') as cred_data:
    info = json.load(cred_data)
consumer_Key = info['Consumer_Key']
consumer_Secret = info['Consumer_Secret']
access_Key = info['Access_Key']
access_Secret = info['Access_Secret']

maxTweets = int(input('Enter the Number of tweets that you want to extract '))
userTopic = input('What topic do you want to search for ')
topic = ('"' + userTopic + '"')
tweetCount = 0

auth = OAuthHandler(consumer_Key, consumer_Secret)
auth.set_access_token(access_Key, access_Secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

tweets = api.search(q=topic, count=maxTweets, tweet_mode='extended')
for tweet in tweets:
    tweetCount = (tweetCount + 1)
    with open('TweetsAbout' + userTopic, 'a', encoding='utf-8') as the_File:
        print(tweet.full_text.translate(nonBmpMap))
        tweet = (str(tweet.full_text).translate(nonBmpMap).replace(',', '').replace('|', '')
                 .replace('\n', '').replace('’', '\'').replace('…', "end"))
        the_File.write(tweet + "\n")

print('Extracted ' + str(tweetCount) + ' tweets about ' + topic)
Try this, see if it works!

try:
    specific_tweets = tweepy.Cursor(api.search, tweet_mode='extended',
                                    q=<your_query_string> + " -filter:retweets",
                                    lang='en').items(500)
except tweepy.error.TweepError:
    pass

for tweet in specific_tweets:
    extracted_text = tweet.full_text

All the text you're trying to extract should be in extracted_text. Good luck!!
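If you want to keep retweets rather than filter them out, note that even with tweet_mode='extended' a retweet's own full_text stays truncated; the complete text lives on the original status, as the COVID scraping question above also does. A small sketch:

def get_full_text(tweet):
    # for retweets, the untruncated text is on the retweeted status
    try:
        return tweet.retweeted_status.full_text
    except AttributeError:  # not a retweet
        return tweet.full_text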

How can I crawl (scrape) Twitter for several keywords in Python?

I wrote the code below, but I don't think it works. I want the keywords combined with "or" semantics rather than "and" semantics, but it seems like only 'keyword1' is extracted. How do I correct this?
import tweepy
import time
import os

search_term = 'keyword1'
search_term2 = 'keyword2'
lat = "37.6"
lon = "127.0"
radius = "200km"
location = "%s,%s,%s" % (lat, lon, radius)

API_key = "11111"
API_secret = "22222"
Access_token = "33333"
Access_token_secret = "444"

auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)

c = tweepy.Cursor(api.search,
                  q=(search_term or search_term2),
                  rpp=1000,
                  geocode=location,
                  include_entities=True)

data = {}
i = 1
for tweet in c.items():
    data['text'] = tweet.text
    print(i, ":", data)
    i += 1
    time.sleep(1)

wfile = open(os.getcwd()+"/twtw2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
    data['text'] = tweet.text
    wfile.write(data['text']+'\n')
    i += 1
    time.sleep(1)
wfile.close()
Maybe change this line

q=(search_term or search_term2),

to

q="{}+OR+{}".format(search_term, search_term2),

Case matters here for the OR operator. Enter q as a string, not as a Python expression that is short-circuit evaluated: search_term or search_term2 simply evaluates to the first truthy operand, i.e. 'keyword1', which is why only keyword1 is searched. By the way, your credentials (from your post) also work for me.
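For more than two keywords, the same fix generalizes; a sketch using str.join instead of hand-written placeholders (the spaces around OR are equivalent to the + signs, which are just URL-encoded spaces):

search_terms = ['keyword1', 'keyword2', 'keyword3']
q = " OR ".join(search_terms)   # 'keyword1 OR keyword2 OR keyword3'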
