How to fix json.decoder.JSONDecodeError when using the googletrans API? - python

I am trying to translate a series of tweets from Italian into English. They are contained in a CSV file, so I extract them with pandas and compute the sentiment with Vader. Unfortunately, I get this error: json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0).
I have tried both removing the emoji from the tweets and using a VPN, as suggested in some other posts, but it doesn't work.
import emoji
import demoji
import pandas as pd
from googletrans import Translator
# Vader's analyzer (the exact import path is assumed from the usage below)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)

def extract_emojis(text):
    return ''.join(c for c in text if c in emoji.UNICODE_EMOJI)

def clean_emojis(text):
    toreturn = ""
    for c in text:
        if c not in emoji.UNICODE_EMOJI:
            toreturn += c
    return toreturn

def sentiment_analyzer_scores(text, engl=True):
    emojis = ""
    if engl:
        translation = text
    else:
        try:
            emojis = extract_emojis(text)
            text = clean_emojis(text)
            demoji.replace(text)
            text = remove_emoji(text)
            text = text.encode('ascii', 'ignore').decode('ascii')
            # translator = Translator(from_lang="Italian", to_lang="English")
            # translation = translator.translate(text)
            translation = translator.translate(text).text
            # print(translation)
        except Exception as e:
            print(text)
            print(e)
            translation = text  # fall back to the untranslated text
    text = translation + emojis
    # print(text)
    score = analyser.polarity_scores(text)
    return score['compound']

def anl_tweets(lst, engl=True):
    sents = []
    id = 0
    for tweet_text in lst:
        try:
            sentiment = sentiment_analyzer_scores(tweet_text, engl)
            sents.append(sentiment)
            id = id + 1
            print("Sentiment of tweet no. %s = %s" % (id, sentiment))
        except Exception as e:
            sents.append(0)
            print(e)
    return sents

# Main
translator = Translator()
analyser = SentimentIntensityAnalyzer()
file_name = 'file.csv'
df = pd.read_csv(file_name)
print(df.shape)
# Calculate sentiment and add the column
df['tweet_sentiment'] = anl_tweets(df.tweet_text, False)
# Save the changes
df.to_csv(file_name, encoding='utf-8', index=False)

This has nothing to do with emoji. Google limits how many characters you can translate, and when you reach that limit the API simply blocks you.
Read about the quota here.
A simple solution is to break your script into multiple chunks and use a proxy server / a different IP address.
Another option is https://pypi.org/project/translate/
(I haven't tried it though)
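For what it's worth, a minimal sketch of translating in small batches with a pause between requests, so a long run is less likely to hit the quota in one go (the batch size, the delay, and the it/en language codes are my assumptions, not values from the question):

import time
from googletrans import Translator

def translate_in_chunks(texts, batch_size=50, delay=5):
    # Translate a list of strings in small batches, pausing between batches
    translator = Translator()
    results = []
    for i in range(0, len(texts), batch_size):
        for t in texts[i:i + batch_size]:
            try:
                results.append(translator.translate(t, src='it', dest='en').text)
            except Exception as e:
                # If the quota kicks in anyway, keep the original text and move on
                print("Translation failed:", e)
                results.append(t)
        time.sleep(delay)  # back off between batches
    return results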

Related

How to remove duplicate tweets in Python?

I am trying to retrieve about 1000 tweets for a search term like 'NFL' using tweepy and store them in a DataFrame with pandas. My issue is that I can't find a way to remove duplicate tweets; I have tried df.drop_duplicates, but it only leaves me with about 100 tweets to work with. Help would be appreciated!
import tweepy
import pandas as pd

# Assumes `api` is an already-authenticated tweepy.API object
num_needed = 1000
tweet_list = []  # Lists to be added as columns (tweets, usernames, and screen names) in our dataframe
user_list = []
screen_name_list = []
last_id = -1  # ID of last tweet seen

while len(tweet_list) < num_needed:
    try:
        # Criteria for collecting the tweets I want, so the results are as accurate as possible for the final analysis
        new_tweets = api.search(q='NFL', count=num_needed, max_id=str(last_id - 1), lang='en', tweet_mode='extended')
    except tweepy.TweepError as e:
        print("Error", e)
        break
    else:
        if not new_tweets:
            print("Could not find any more tweets!")
            break
        else:
            for tweet in new_tweets:
                # Fetch the screen name and username
                screen_name = tweet.author.screen_name
                user_name = tweet.author.name
                tweet_text = tweet.full_text

                tweet_list.append(tweet_text)
                user_list.append(user_name)
                screen_name_list.append(screen_name)

df = pd.DataFrame()  # Create a new dataframe (df) with new columns
df['Screen name'] = screen_name_list
df['Username'] = user_list
df['Tweets'] = tweet_list
Well, yes, when you use .drop_duplicates() you only get about 100 tweets, because that's how many unique ones there are. It doesn't matter what technique you use here; there are roughly 900 duplicates with the way your code runs.
So you might be asking: why? api.search returns at most 100 tweets per call, which I am assuming you are aware of, since you loop and try to get more by using the max_id parameter. However, your last_id is always -1 here: you never read the tweet IDs, so the parameter never changes and every iteration fetches the same 100 tweets. One thing you can do is collect the IDs while you iterate through the tweets, then store the minimum ID value as last_id, and it will work in your loop:
Code:
num_needed = 1000
tweet_list = []  # Lists to be added as columns (tweets, usernames, and screen names) in our dataframe
user_list = []
screen_name_list = []
tw_id = []  # <-- ADDED THIS
last_id = -1  # ID of last tweet seen

while len(tweet_list) < num_needed:
    try:
        # Criteria for collecting the tweets I want, so the results are as accurate as possible for the final analysis
        new_tweets = api.search(q='NFL -filter:retweets', count=num_needed, max_id=str(last_id - 1), lang='en', tweet_mode='extended')
    except tweepy.TweepError as e:
        print("Error", e)
        break
    else:
        if not new_tweets:
            print("Could not find any more tweets!")
            break
        else:
            for tweet in new_tweets:
                # Fetch the screen name and username
                screen_name = tweet.author.screen_name
                user_name = tweet.author.name
                tweet_text = tweet.full_text

                tweet_list.append(tweet_text)
                user_list.append(user_name)
                screen_name_list.append(screen_name)
                tw_id.append(tweet.id)  # <-- ADDED THIS
            last_id = min(tw_id)  # <-- ADDED THIS

df = pd.DataFrame({'Screen name': screen_name_list,
                   'Username': user_list,
                   'Tweets': tweet_list})
df = df.drop_duplicates()
This returns approximately 1000 tweets for me.
Output:
print (len(df))
1084
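As a small follow-up, if the duplicates you care about are repeated tweet texts rather than fully identical rows, drop_duplicates can be restricted to a column subset; a minimal sketch, assuming the column names used above:

# Keep only the first occurrence of each distinct tweet text
df = df.drop_duplicates(subset=['Tweets'])
print(len(df))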

Why does my Python code not retrieve Arabic tweets when it works with other languages? Why does it return random characters?

My code is in Python 3, and I have used it before to live-stream tweets in English. However, when the same code searches for an Arabic query, it returns all the tweets as symbols and random characters (see the screenshot). (P.S.: I am a beginner in coding. Thank you!) Here is my code:
import twitter, json, csv

CONSUMER_KEY = '<consumer key>'
CONSUMER_SECRET = '<consumer secret>'
OAUTH_TOKEN = '<oauth token>'
OAUTH_TOKEN_SECRET = '<oauth token secret>'

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)

# Set up a file to write to
csvfile = open('tweets_extended.csv', 'w')
csvwriter = csv.writer(csvfile, delimiter='|')

# Here's a function that takes out characters that can break
# our import into Excel and replaces them with spaces.
# It also does the unicode bit.
def getVal(val):
    clean = ""
    if val:
        val = val.replace('|', ' ')
        val = val.replace('\n', ' ')
        val = val.replace('\r', ' ')
        clean = val.encode('utf-8')
    return clean

q = "سلمان"  # Comma-separated list of terms can go here
print('Filtering the public timeline for track="%s"' % (q,))
twitter_stream = twitter.TwitterStream(auth=twitter_api.auth)
stream = twitter_stream.statuses.filter(track=q)

for tweet in stream:
    try:
        if tweet['truncated']:
            tweet_text = tweet['extended_tweet']['full_text']
        else:
            tweet_text = tweet['text']
        # Write the values to file
        csvwriter.writerow([
            tweet['created_at'],
            getVal(tweet['user']['screen_name']),
            getVal(tweet_text),
            getVal(tweet['user']['location']),
            tweet['user']['statuses_count'],
            tweet['user']['followers_count'],
            tweet['user']['lang'],
            tweet['user']['id'],
        ])
        # Print something to the screen, mostly so we can see what is going on...
        print(tweet['user']['screen_name'].encode('utf-8'), tweet['text'].encode('utf-8'))
    except Exception as err:
        print(err)
        pass
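One detail in the code above that may explain the garbled output (this is my reading of it, not a confirmed fix): in Python 3, getVal returns bytes because of val.encode('utf-8'), and csv.writer then writes the repr of those bytes (b'...'), which looks like random characters for Arabic text. A minimal sketch of keeping the values as plain strings and opening the file with an explicit UTF-8 encoding instead:

import csv

# Open the CSV with an explicit UTF-8 encoding and let csv.writer handle plain strings
csvfile = open('tweets_extended.csv', 'w', encoding='utf-8', newline='')
csvwriter = csv.writer(csvfile, delimiter='|')

def getVal(val):
    # Strip characters that would break the '|'-delimited import, but keep the value a str
    if not val:
        return ""
    return val.replace('|', ' ').replace('\n', ' ').replace('\r', ' ')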

Error while creating chatbot using python (Row data)

So I am creating a chatbot inspired by the tutorials from sentdex, yet I ran into an error I cannot figure out.
I am using the latest version of Python.
Code for the chatbot:
import sqlite3
import json
from datetime import datetime

timeframe = '2007-02'
sql_transaction = []
connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()

def create_table():
    c.execute("""CREATE TABLE IF NOT EXISTS parent_reply
        (parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT,
        comment TEXT, subreddit TEXT, unix INT, score INT)""")

def format_data(date):
    data = data.replace("\n", " newlinechar ").replace("\r", " newlinechar ").replace('"', "'")
    return data

def find_parent(pid):
    try:
        sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
        c.execute(sql)
        result = c.fetchone()
        if result != None:
            return result[0]
        else:
            return False
    except Exception as e:
        # print("find_parent", e)
        return False

if __name__ == "__main__":
    create_table()
    row_counter = 0
    paired_rows = 0
    with open("/home/anonymouz/Desktop/redditdata/{}/RC_{}".format(timeframe.split('-')[0], timeframe), buffering=1000) as f:
        for row in f:
            print(row)
            row_counter += 1
            row = json.loads(row)
            parent_id = row['parent_id']
            body = format_data(row['body'])
            created_utc = row['created_utc']
            score = row['score']
            subreddit = row['subreddit']
            parent_data = find_parent(parent_id)
And the error I am getting:
Traceback (most recent call last):
File "/home/anonymouz/Desktop/redditdata/reddit.py", line 44, in <module>
body = format_data(row['body'])
File "/home/anonymouz/Desktop/redditdata/reddit.py", line 17, in format_data
data = data.replace("\n"," newlinechar ").replace("\r"," newlinechar ").replace('"',"'")
UnboundLocalError: local variable 'data' referenced before assignment
>>>
Thank you to anyone who is able to help and isn't rude about it :)
A cleaner version of the code with correct indentation:
https://pastebin.com/2ifpEQy9
def format_data(date):
Your parameter is 'date' but your local variable is 'data', so the first line of the function reads data before anything has been assigned to it, which is what raises the UnboundLocalError. Change the parameter name to 'data':
def format_data(data):
    data = data.replace("\n", " newlinechar ").replace("\r", " newlinechar ").replace('"', "'")
    return data
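In other words, the assignment data = data.replace(...) makes data a local variable of format_data, and reading a local name before it has been assigned raises UnboundLocalError. A tiny standalone example of the same error, separate from the chatbot code:

def broken(x):   # parameter is named 'x'
    y = y + 1    # 'y' is local (it gets assigned here) but is read before assignment
    return y

broken(1)        # UnboundLocalError: local variable 'y' referenced before assignment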

Trouble opening up a twitter text file in python

I gathered a bunch of tweets for analysis with Python, but when trying to open the resulting text file I received this error message. I don't know if something is wrong with the schema of the tweets that I collected.
JSONDecodeError: Extra data: line 2 column 1 (char 12025)
Here is the code I used to open it:
with open('tweets1.json') as dakota_file:
    dakota_j = json.loads(dakota_file.read())
And here is the code that collected the tweets:
import sys
import jsonpickle
import os

searchQuery = '#Dakota-Access-Pipeline'  # this is what we're searching for
# maxTweets = 10000000  # Some arbitrary large number
maxTweets = 6000
tweetsPerQry = 100  # this is the max the API permits
# fName = 'tweets.txt'  # We'll store the tweets in a text file.
fName = 'tweets.json'

# If results from a specific ID onwards are required, set since_id to that ID,
# else default to no lower limit and go as far back as the API allows.
sinceId = None

# If only results below a specific ID are wanted, set max_id to that ID,
# else default to no upper limit and start from the most recent tweet matching the search query.
max_id = -10000000

tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))
with open(fName, 'w') as f:
    while tweetCount < maxTweets:
        try:
            if (max_id <= 0):
                if (not sinceId):
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry)
                else:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            since_id=sinceId)
            else:
                if (not sinceId):
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                else:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1),
                                            since_id=sinceId)
            if not new_tweets:
                print("No more tweets found")
                break
            for tweet in new_tweets:
                f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id
        except tweepy.TweepError as e:
            # Just exit on any error
            print("some error : " + str(e))
            break

print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))

get full text from pubmed

I am using the Python API Bio (Biopython) to access the PubMed Central database, but unfortunately I can only get the abstract from this API.
I want to know if it is possible to get the full text, and how.
molp5 is a file containing a listing of molecules like below:
Flavopiridol
4-azapaullone
Here is my code:
from Bio import Entrez
import pandas as pd

def search(query):
    Entrez.email = 'xxxxx#gmail.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='3000',
                            retmode='text',
                            rettype='Medline',
                            term=query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'xxxxx#gmail.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

if __name__ == '__main__':
    # Load the file containing the names of the molecules
    mol = pd.read_csv('/W2V/molp5.csv')
    mol["idx"] = mol["idx"].apply(lambda x: x.lower())
    txt = ""
    retmax = []
    for m in mol["idx"]:
        results = search(m)
        # Print the number of articles available and the name of the molecule
        print(m, results['RetMax'])
        id_list = results['IdList']
        papers = fetch_details(id_list)
        for i, paper in enumerate(papers):
            try:
                # Concatenate the title and abstract together
                txt += paper['MedlineCitation']['Article']['ArticleTitle']
                for j in paper['MedlineCitation']['Article']['Abstract']['AbstractText']:
                    txt += j + '\n'
            except KeyError:
                pass
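Full text is not available through the pubmed database itself. One common approach with Bio.Entrez, which only works for articles deposited as open access in PubMed Central, is to map the PubMed IDs to PMC IDs with elink and then efetch from the pmc database as XML. A minimal sketch under that assumption (the helper name is mine, and many PMIDs will simply have no PMC full text):

from Bio import Entrez

Entrez.email = 'xxxxx#gmail.com'  # same placeholder address as above

def fetch_pmc_fulltext_xml(pubmed_ids):
    # Map PubMed IDs to PMC IDs; only open-access / deposited articles have one
    links = Entrez.read(Entrez.elink(dbfrom='pubmed', db='pmc', id=','.join(pubmed_ids)))
    pmc_ids = [link['Id']
               for linkset in links
               for linksetdb in linkset.get('LinkSetDb', [])
               for link in linksetdb['Link']]
    if not pmc_ids:
        return None
    # Fetch the full-text XML (pmc-articleset) for the articles that are in PMC
    handle = Entrez.efetch(db='pmc', id=','.join(pmc_ids), retmode='xml')
    return handle.read()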
