I can't get the full text of the Arabic tweet when I use tweet_mode='extended' and tweet.full_text.encode('utf-8') I don't know what is the problem
HashValue = "#لقاح"
StartDate = "2022-01-02"
csvFile = open(HashValue+'.csv', 'a',encoding='utf-8')
csvWriter = csv.writer(csvFile)
for tweet in tweepy.Cursor(api.search,
q=HashValue,
count=20,
lang="ar",
since=StartDate,
tweet_mode='extended').items():
print (tweet.created_at, tweet.full_text)
csvWriter.writerow([tweet.created_at, tweet.full_text.encode('utf-8')])
print ("Scraping finished and saved to "+HashValue+".csv")
Related
I'm trying to do a social data science research paper and all I'm trying to do right now is download some tweets and put them into a CSV file. Every time I do this though, the script executes but the CSV file is empty when opened:
import tweepy
import csv
consumer_key=""
consumer_secret=""
access_token=""
access_token_secret=""
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
search_words = "leftists -filter:retweets"
date_since = "2021-01-18"
# how does this algorithm determine what tweets are printed?
tweets = tweepy.Cursor(api.search, q=search_words, lang="en",since=date_since).items(100)
# open and create a file to append the data to
tweets = str(tweets)
csvFile = open(tweets+'.csv', 'a')
# use the csv file
# loop through the tweets variable and add them to the CSV file
with open('tweets.csv', 'w') as csvfile:
csvWriter = csv.writer(csvFile)
for tweet in tweepy.Cursor(api.search,q=tweets,count=100,lang="en",since=date_since, tweet_mode='extended').items(10):
print (tweet.created_at, tweet.full_text)
csvWriter.writerow([tweet.created_at, tweet.full_text.encode('utf-8')])
csvFile.close()
print ("Scraping finished and saved to "+tweets+".csv")
for tweet in tweets:
print(tweet.text)
You can try to use this different way:
import xlwt
book = xlwt.Workbook(encoding="utf-8")
sheet1 = book.add_sheet("Sheet 1")
counter = 0
for tweet in tweepy.Cursor(api.search,q=tweets,count=100,lang="en",since=date_since, tweet_mode='extended').items(10):
sheet1.write(counter, 0, str(tweet.created_at, tweet.full_text.encode('utf-8')))
counter += 1
book.save("trial.csv")
Think I'm overlooking the answer so need a fresh pair of eyes. My script should search and get tweets, then write the tweet date, username, and tweet text on one row, separated by columns. Then write the next matching tweet to a new row and so on. Printing the returned twitter object values confirms all ok. Can print and separate the data for each tweet. However, when writing to Excel, my loop code just writes the first tweet n times, without the remaining tweets.
Code:
print ('TEST PRINT...')
for tweet in tweepy.Cursor(api.search, search).items(numberOfTweets):
print(tweet.created_at)
print(tweet.user.screen_name)
print(tweet.text)
print '\n'
for tweet in tweepy.Cursor(api.search, search).items(numberOfTweets):
for rowNum in range(3, sheet.max_row):
sheet.cell(row=rowNum, column=1).value = tweet.created_at
sheet.cell(row=rowNum, column=2).value = tweet.user.screen_name
sheet.cell(row=rowNum, column=3).value = tweet.text
break
The second code block is the issue. How can I write the three above tweet values for each tweet on separate rows?
Thanks in advance...
Yes you are looping with the same tweet. Try this (i could't test) :
rowNum = 0 # or 3 ?
for tweet in tweepy.Cursor(api.search, search).items(numberOfTweets):
sheet.cell(row=rowNum, column=1).value = tweet.created_at
sheet.cell(row=rowNum, column=2).value = tweet.user.screen_name
sheet.cell(row=rowNum, column=3).value = tweet.text
rowNum = rowNum + 1
Ive got a python script intending to scrape tweets from twitter and append them to a csv file. Im using the tweepy module however it is only returning 1 tweet. Is this a problem with my for loop, or with the call to the twitter API?
for status in tweepy.Cursor(twitterapi.search,q="labour party",since="2018-05-01", until="2018-05-10").items(200):
if 'RT' not in status.text:
with open('C:/Users/User/Desktop/twittersentiment.csv', 'wb') as f:
w = csv.writer(f)
favourites = status.user.favourites_count
location = status.user.location.encode('utf8')
tweet_text = ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",status.text.encode('utf8')).split())
date = status.created_at.strftime('%m/%d/%Y')
a = [location]
b=[favourites]
c=[tweet_text]
d=[date]
zip(a,b,c,d)
w.writerow(zip(a,b,c,d))
You should open the file before you start iterating the tweepy.Cursor otherwise each iteration of the cursor will create a new file with one entry, overwriting the previous file.
with open('C:/Users/User/Desktop/twittersentiment.csv', 'wb') as f:
w = csv.writer(f)
for status in tweepy.Cursor(twitterapi.search,q="labour party",since="2018-05-01", until="2018-05-10").items(200):
if 'RT' not in status.text:
favourites = status.user.favourites_count
location = status.user.location.encode('utf8')
tweet_text = ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",status.text.encode('utf8')).split())
date = status.created_at.strftime('%m/%d/%Y')
a = [location]
b=[favourites]
c=[tweet_text]
d=[date]
zip(a,b,c,d)
w.writerow(zip(a,b,c,d))
i am collecting twitter data with python tweepy here is code
class listener (StreamListener):
def on_data(self, raw_data):
data = json.loads(raw_data)
print data.keys()
tweet = data['text'].encode("utf-8")
tweet_id = data['id']
time_tweet = data['timestamp_ms']
date = datetime.datetime.fromtimestamp(int(time_tweet) / 1000)
new_date = str(date).split(" ") [0]
print new_date
user_id = data['user']['id']
with open('twitDB.csv','ab') as csvfile:
myfile = csv.writer(csvfile)
myfile.writerow([tweet_id,new_date,tweet,user_id])
return True
def on_error(self, status_code):
print status_code
auth = OAuthHandler(consumer_key,consumer_secret)
auth.set_access_token(access_token,access_token_secret)
twitterStream = Stream(auth,listener())
twitterStream.filter(track=["car"])
But in Csv File for tweet_id and user_id are in 8.85132E+17 format how i resolve this ?
Place a tab character in front of tweet_id
tweet_id = '\t' + data['id']
I am doing preprocessing tweet in Python. My unpreprocess tweets are in a folder. Each file containing unpreprocess tweet named 1.txt, 2.txt,...10000.txt. I want to preprocess them and write them into new files that also named 1.txt , 2.txt,...10000.txt.
My code is as follows :
for filename in glob.glob(os.path.join(path, '*.txt')):
with open(filename) as file:
tweet=file.read()
def processTweet(tweet):
tweet = tweet.lower()
tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
tweet = re.sub('#[^\s]+','USER',tweet)
tweet = re.sub('[\s]+', ' ', tweet)
tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
tweet = tweet.translate(None, string.punctuation)
tweet = tweet.strip('\'"')
return tweet
fp = open(filename)
line = fp.readline()
count = 0
processedTweet = processTweet(line)
line = fp.readline()
count += 1
name = str(count) + ".txt"
file = open(name, "w")
file.write(processedTweet)
file.close()
But that code just give me a new file named 1.txt that already preprocessed. How can I write the other 9999 files? Is there any mistake in my code?
Your count is getting reset to 0 with the call to count=0. So everytime it is about to write a file, it write "1.txt". Why are you trying to reconstruct the filename, instead of just using the existing filename for the tweet you are preprocessing. Also, you should move your function definition to outside the loop:
def processTweet(tweet):
tweet = tweet.lower()
tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
tweet = re.sub('#[^\s]+','USER',tweet)
tweet = re.sub('[\s]+', ' ', tweet)
tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
tweet = tweet.translate(None, string.punctuation)
tweet = tweet.strip('\'"')
return tweet
for filename in glob.glob(os.path.join(path, '*.txt')):
with open(filename) as file:
tweet=file.read()
processedTweet = processTweet(tweet)
file = open(filename, "w")
file.write(processedTweet)
file.close()