I scraped Twitter data and tried to translate the whole dataset (around 30,000 rows) with GoogleTranslator from the deep_translator package in Python. To work around the API limit, I split the dataset into smaller batches, but there is still a problem, probably with the NaN values in user_description. Could someone help out?
import pickle
import time

from deep_translator import GoogleTranslator

batch_1 = no_eng_data.iloc[0:1000]
batch_2 = no_eng_data.iloc[1000:2000]

def batch_translation(batch):
    data_list = []
    for index, row in batch.iterrows():
        # NaN is the only value that is not equal to itself
        if row["user_description"] != row["user_description"]:
            text = ""
        else:
            text = row["user_description"]
        translated = GoogleTranslator(source='auto', target='en').translate(text=text)
        data_list.append(translated)
    return data_list

data_list_1 = batch_translation(batch_1)
with open('data_list_1.pkl', 'wb') as f:
    pickle.dump(data_list_1, f)
time.sleep(300)
The error:

NotValidPayload: --> text must be a valid text with maximum 5000 character, otherwise it cannot be translated
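For what it's worth, a minimal sketch of one way to sidestep that error, assuming the column really does contain NaN or empty strings: the translator rejects empty payloads, so they are skipped instead of being sent to the API, and long texts are trimmed to stay under the 5000-character limit mentioned in the error.

from deep_translator import GoogleTranslator

def safe_batch_translation(batch):
    # Hedged sketch: never hand translate() an empty or non-string payload,
    # and cap the length below the 5000-character limit.
    translator = GoogleTranslator(source='auto', target='en')
    results = []
    for text in batch["user_description"]:
        if not isinstance(text, str) or not text.strip():
            results.append("")  # keep row alignment without calling the API
            continue
        results.append(translator.translate(text=text[:4999]))
    return results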
I am using a function to count occurrences of a given word in PDFs using PyPDF2. While the function is running, I get this message in the terminal:
FloatObject (b'0.000000000000-14210855') invalid; use 0.0 instead
My code:
def count_words(word):
    print()
    print('Counting words..')
    files = os.listdir('./pdfs')
    counted_words = []
    for idx, file in enumerate(files, 1):
        with open(f'./pdfs/{file}', 'rb') as pdf_file:
            ReadPDF = PyPDF2.PdfFileReader(pdf_file, strict=False)
            pages = ReadPDF.numPages
            words_count = 0
            for page in range(pages):
                pageObj = ReadPDF.getPage(page)
                data = pageObj.extract_text()
                words_count += sum(1 for match in re.findall(rf'\b{word}\b', data, flags=re.I))
        counted_words.append(words_count)
        print(f'File: {idx}')
    return counted_words
How do I get rid of this message?
See https://pypdf2.readthedocs.io/en/latest/user/suppress-warnings.html
import logging
logger = logging.getLogger("PyPDF2")
logger.setLevel(logging.ERROR)
The PDF specification has never allowed scientific (exponent/mantissa) floats, which is roughly what yours looks like. An unscrupulous PDF producer has therefore output a malformed PDF file. PyPDF2's choice to convert the value to 0.0 seems like a solid response.
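For completeness, a minimal usage sketch (assuming the legacy PdfFileReader API used in the question); the only requirement is that the logging level is set before the reader parses the pages:

import logging
import PyPDF2

logging.getLogger("PyPDF2").setLevel(logging.ERROR)  # hide warnings such as the FloatObject one

with open('./pdfs/example.pdf', 'rb') as pdf_file:  # 'example.pdf' is a placeholder path
    reader = PyPDF2.PdfFileReader(pdf_file, strict=False)
    first_page_text = reader.getPage(0).extract_text()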
I'm in need of some advice with my Twitter sentiment analysis.
I'm trying to do a pretty common sentiment analysis, not on random tweets from a Twitter search, but on the tweets of selected users.
What I've tried so far: I read in a CSV of the users, iterate over that list, and conduct the tweet analysis user by user.
I'll put my write_tweets function here, just so it can maybe get some feedback :)
def write_tweets(users_df, file):
    # If the file exists, then read the existing data from the CSV file.
    if os.path.exists(file):
        df = pd.read_csv(file, header=0)
    else:
        df = pd.DataFrame(columns=COLS)
    # page attribute in tweepy.Cursor and iteration
    for user in users_df[0]:
        #for user in users_list:
        print(user)
        #user = 'fion_li'
        try:
            #for status in tweepy.Cursor(api.user_timeline, screen_name=user, count=1, tweet_mode="extended").items(22):
            for status in tweepy.Cursor(api.user_timeline, screen_name=user, count=1, tweet_mode="extended").items(1):
                #print(status)
                new_entry = []
                status = status._json
                #print(to_datetime(status['created_at']))
                #print(status['full_text'])
                #csvFile = open(file, 'a', encoding='utf-8')
                if (to_datetime(status['created_at']) < startDate):
                    #print(to_datetime(status['created_at']))
                    #print(status['full_text'])
                    continue
                # check whether the tweet is in English, or skip to the next tweet
                if status['lang'] != 'en':
                    continue
                # tweepy preprocessing called for basic preprocessing
                #clean_text = clean(status['entities'])
                clean_text = clean(status['full_text'])
                # call the clean_tweets method for extra preprocessing
                filtered_tweet = clean_tweets(clean_text)
                # pass to the TextBlob methods for sentiment calculations
                blob = TextBlob(filtered_tweet)
                blob_2 = TextBlob(filtered_tweet, analyzer=NaiveBayesAnalyzer())
                Sentiment = blob.sentiment
                Sentiment_2 = blob_2.sentiment
                # separate polarity and subjectivity into two variables
                polarity = Sentiment.polarity
                subjectivity = Sentiment.subjectivity
                positivity = Sentiment_2.p_pos
                negativity = Sentiment_2.p_neg
                # append the new entry
                new_entry += [status['id'], status['created_at'],
                              status['source'],
                              #status['full_text'],
                              filtered_tweet, Sentiment, polarity, subjectivity, positivity, negativity, status['lang'],
                              status['favorite_count'], status['retweet_count']]
                # append the original author of the tweet
                new_entry.append(status['user']['screen_name'])
                try:
                    is_sensitive = status['possibly_sensitive']
                except KeyError:
                    is_sensitive = None
                new_entry.append(is_sensitive)
                # hashtags and mentions are saved comma separated
                hashtags = ", ".join([hashtag_item['text'] for hashtag_item in status['entities']['hashtags']])
                new_entry.append(hashtags)
                mentions = ", ".join([mention['screen_name'] for mention in status['entities']['user_mentions']])
                new_entry.append(mentions)
                # get the location of the tweet if possible
                try:
                    location = status['user']['location']
                except TypeError:
                    location = ''
                new_entry.append(location)
                try:
                    coordinates = [coord for loc in status['place']['bounding_box']['coordinates'] for coord in loc]
                except TypeError:
                    coordinates = None
                new_entry.append(coordinates)
                single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
                #print(single_tweet_df)
                df = df.append(single_tweet_df, ignore_index=True)
                csvFile = open(file, 'a', encoding='utf-8')
        except Exception as e:
            pass
    #csvFile = open(file, 'a', encoding='utf-8')
    df.to_csv(csvFile, mode='a', columns=COLS, index=False, encoding="utf-8")

write_tweets(users_list, test_file)
The output would be a few indicators of sentiment, like positivity, negativity, neutrality, etc.
My question is: maybe some of you have done this kind of thing already and can give me some recommendations? My version seems very slow and not very efficient (to me, at least).
Thanks in advance
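One pattern that usually helps with the speed, sketched under the assumption that the per-tweet processing stays exactly as in write_tweets above: collect the rows in a plain list and build/write the DataFrame once, instead of calling df.append and reopening the CSV file inside the loop (df.append copies the whole frame on every call). The build_entry helper below is hypothetical shorthand for the existing per-tweet logic.

rows = []
for user in users_df[0]:
    for status in tweepy.Cursor(api.user_timeline, screen_name=user,
                                tweet_mode="extended").items(1):
        entry = build_entry(status._json)  # hypothetical: wraps the per-tweet logic above
        if entry is not None:
            rows.append(entry)

# one DataFrame construction and one write at the end
result_df = pd.DataFrame(rows, columns=COLS)
result_df.to_csv(file, mode='a', index=False, encoding='utf-8',
                 header=not os.path.exists(file))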
I'm a relative novice at Python, but I've somehow managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with is 5 columns at the end of my .CSV, each outputting the X-th most commonly used value.
So something along the lines of this:
I've Googled for a while and managed to isolate them separately, but I always end up with '('#thekidfromakron', 2)' as an output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter
ts = time.gmtime()
def get_csv_header(top_numb):
    fieldnames = ['USER', 'MEDIA COUNT', 'FOLLOWERCOUNT', 'TOTAL LIKES', 'TOTAL COMMENTS', 'ER', 'ER IN %', 'BIO', 'ALL CAPTION TEXT', 'HASHTAGS COUNTED', 'MOST COMMON HASHTAGS']
    return fieldnames

def write_csv_header(filename, headers):
    with open(filename, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=headers)
        writer.writeheader()
    return

def read_user_name(t_file):
    with open(t_file) as f:
        user_list = f.read().splitlines()
    return user_list
if __name__ == '__main__':
    # HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
    # which contains a list of usernames; BY DEFAULT <current working directory>/userlist.txt
    USER_FILE = 'userlist.txt'
    # HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv), where your final result stays
    DATA_FILE = 'users_with_er.csv'
    MAX_POST = 12  # MAX POST

    print('Starting the engagement calculations... Please wait until it finishes!')

    users = read_user_name(USER_FILE)

    """ Writing data to csv file """
    csv_headers = get_csv_header(MAX_POST)
    write_csv_header(DATA_FILE, csv_headers)

    for user in users:
        post_info = {'USER': user}
        url = 'https://www.instagram.com/' + user + '/'
        # for troubleshooting, un-comment the next two lines:
        #print(user)
        #print(url)
        try:
            r = requests.get(url)
            if r.status_code != 200:
                print(timestamp, ' user {0} not found or page unavailable! Skipping...'.format(user))
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
            stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
            j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
            timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
        except ValueError:
            print(timestamp, 'ValueError for username {0}...Skipping...'.format(user))
            continue
        except IndexError as error:
            # Output expected IndexErrors.
            print(timestamp, error)
            continue

        if j['graphql']['user']['edge_followed_by']['count'] <= 0:
            print(timestamp, 'user {0} has no followers! Skipping...'.format(user))
            continue
        if j['graphql']['user']['edge_owner_to_timeline_media']['count'] < 12:
            print(timestamp, 'user {0} has less than 12 posts! Skipping...'.format(user))
            continue
        if j['graphql']['user']['is_private'] is True:
            print(timestamp, 'user {0} has a private profile! Skipping...'.format(user))
            continue

        media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
        accountname = j['graphql']['user']['username']
        followercount = j['graphql']['user']['edge_followed_by']['count']
        bio = j['graphql']['user']['biography']

        i = 0
        total_likes = 0
        total_comments = 0
        all_captiontext = ''
        while i <= 11:
            total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
            total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
            captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
            caption_detail = captions['edges'][0]['node']['text']
            all_captiontext += caption_detail
            i += 1

        engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount) / 12) * 100) + '%'
        engagement_rate = (((total_likes + total_comments) / followercount) / 12 * 100)

        # isolate and count hashtags
        hashtags = re.findall(r'#\w*', all_captiontext)
        hashtags_counted = Counter(hashtags)
        most_common = hashtags_counted.most_common(5)

        with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
            print(timestamp, 'Writing Data for user {0}...'.format(user))
            post_info["USER"] = accountname
            post_info["FOLLOWERCOUNT"] = followercount
            post_info["MEDIA COUNT"] = media_count
            post_info["TOTAL LIKES"] = total_likes
            post_info["TOTAL COMMENTS"] = total_comments
            post_info["ER"] = engagement_rate
            post_info["ER IN %"] = engagement_rate_percentage
            post_info["BIO"] = bio
            post_info["ALL CAPTION TEXT"] = all_captiontext
            post_info["HASHTAGS COUNTED"] = hashtags_counted
            csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
            csv_writer.writerow(post_info)

    """ Done with the script """
    print('ALL DONE !!!! ')
The code that goes before this simply scrapes the webpage, and compiles all the captions from the last 12 posts into "all_captiontext".
Any help to solve this (probably simple) issue would be greatly appreciated as I've been struggling with this for days (again, I'm a noob :') ).
Replace line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
for i, counter_tuple in enumerate(most_common):
    tag_name = counter_tuple[0].replace('#', '')
    label = "Top %d" % (i + 1)
    post_info[label] = tag_name
There's also a bit of code missing. For example, your code doesn't include a csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening the file to write just one row. I don't think that's intended, so what you probably want to do is collect the results into a list of dictionaries. A cleaner solution would be to use a pandas DataFrame, which you can write straight to a CSV file.
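A rough sketch of that idea (assuming post_info is filled exactly as in the existing loop; only the collection and the final write change):

import pandas as pd

all_rows = []
for user in users:
    post_info = {'USER': user}
    # ... scrape the profile and fill post_info as in the existing loop ...
    all_rows.append(post_info)

# one write at the end instead of opening the file once per user
pd.DataFrame(all_rows).to_csv('users_with_er.csv', index=False, encoding='utf-8')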
Since most_common is the output of the call to hashtags_counted.most_common, I had a look at the docs here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
The output is formatted as follows: [(key, value), (key, value), ...], ordered by decreasing number of occurrences.
Hence, to get only the name and not the number of occurrences, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
with:
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuples. This statement builds, on the fly, the list of the first element of each tuple, keeping the sort order.
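A quick illustration of what that comprehension does:

from collections import Counter

hashtags_counted = Counter(['#jamesgang', '#jamesgang', '#striveforgreatness'])
most_common = hashtags_counted.most_common(2)

print(most_common)                  # [('#jamesgang', 2), ('#striveforgreatness', 1)]
print([x[0] for x in most_common])  # ['#jamesgang', '#striveforgreatness']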
I am trying to get data from a PDF on a website with Python.
The website is
https://wgfd.wyo.gov/WGFD/media/content/PDF/Hunting/Drawing%20Odds/Draw_Elk_RD_Res_2010.pdf
My code imports it all into a single string and removes the header information successfully. I am trying to build a pandas data frame, but some unusual/hidden characters are causing problems, so I can't just split it into a list. I have figured out that there are 131 characters in the first few rows, but then a hidden control character disappears, resulting in the row not being parsed correctly. I have tried removing the standard /n, /r, /t, and even /v with no luck. The mysterious character apparently spaces down two rows.
pdfFileObj = open(filename, 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
num_pages = pdfReader.numPages
text = ''

# create string
for i in range(num_pages):
    pageObj = pdfReader.getPage(i)
    textplus = pageObj.extractText()
    firsthunt = re.findall(r'(?<!\d)\d{3}(?!\d)', textplus)
    numtoskip = textplus.find(str(firsthunt[0]))
    text += textplus[numtoskip:]

# replace characters
text = text.replace(' ', '|')
text = text.replace('/r', '!')
text = text.replace('/n', '#')
text = text.replace('/t', '^')
text = text.replace('/v', '*')

# del file
pdfFileObj.__del__()
os.remove(filename)

rowlength = 131
for i in range(0, 10):
    print(text[i * rowlength:(1 + i) * rowlength])
    print('New Row')
Hope this makes some sense.
Thanks
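One way to see exactly what the mysterious character is, sketched against the text and rowlength variables built above: print repr() of each row, which renders control characters (for example \n, \t, or \x0c) as escape sequences instead of letting them move the cursor.

# Hedged sketch: inspect the raw characters instead of printing them,
# so any hidden control character shows up as an escape sequence.
for i in range(3):
    row = text[i * rowlength:(1 + i) * rowlength]
    print(repr(row))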
I'm a newbie to Python and I'm trying to build a program that will allow me to parse several hundred documents by speaker and their speech (the data is hearing transcripts with a semi-regular structure). After parsing, I write the results into a .csv file, then write another file that splits the speech into paragraphs and makes another .csv. Here is the code (acknowledgements to my colleague for his part in developing this, which was massive):
import os
import re
import csv
from bs4 import BeautifulSoup
path = "path in computer"
os.chdir(path)
with open('hearing_name.htm', 'r') as f:
    hearing = f.read()

Hearing = BeautifulSoup(hearing)
Hearing = Hearing.get_text()
Hearing = Hearing.split("erroneous text")

speakers = re.findall("\\n Mr. [A-Z][a-z]+\.|\\n Ms. [A-Z][a-z]+\.|\\n Congressman [A-Z][a-z]+\.|\\n Congresswoman [A-Z][a-z]+\.|\\n Chairwoman [A-Z][a-z]+\.|\\n Chairman [A-Z][a-z]+\.", hearing)
speakers = list(set(speakers))

print speakers

position = []
for speaker in speakers:
    x = hearing.find(speaker)
    position.append(x)

def find_speaker(hearing, speakers):
    position = []
    for speaker in speakers:
        x = hearing.find(speaker)
        if x == -1:
            x += 1000000
        position.append(x)
    first = min(position)
    name = speakers[position.index(min(position))]
    name_length = len(name)
    chunk = [name, hearing[0:first], hearing[first+name_length:]]
    return chunk

chunks = []
print hearing
names = []

while len(hearing) > 10:
    chunk_try = find_speaker(hearing, speakers)
    hearing = chunk_try[2]
    chunks.append(chunk_try[1])
    names.append(chunk_try[0].strip())

print len(hearing)  # 0
#print dialogue[0:5]

chunks.append(hearing)
chunks = chunks[1:]

print len(names)  # 138
print len(chunks)  # 138

data = zip(names, chunks)

with open('filename.csv', 'wb') as f:
    w = csv.writer(f)
    w.writerow(['Speaker', 'Speach'])
    for row in data:
        w.writerow(row)

paragraphs = str(chunks)
print (paragraphs)

Paragraphs = paragraphs.split("\\n")
data1 = zip(Paragraphs)

with open('Paragraphs.csv', 'wb') as f:
    w = csv.writer(f)
    w.writerow(['Paragraphs'])
    for row in data1:
        w.writerow(row)
Obviously, the code above can do what I need one hearing at a time, but my question is:
how can I automate this to the point where I can either do large batches or all of the files at once (578 hearings in total)? I've tried the code below (which has worked for me in the past when compiling large sets of data), but this time I get no results (memory leak?).
Tested Compiling Code:
hearing = [filename for filename in os.listdir(path)]
hearings = []
#compile hearings
for file in hearing:
    input = open(file, 'r')
    hearings.append(input.read())
Thanks in advance for your help.
First you need to take the first set of code, generalize it, and make it into a giant function. This will involve replacing any hardcoded paths and file names in it with variables named appropriately.
Give the new driver function arguments that correspond to each of the path(s) and file name(s) you replaced. Calling this function will perform all the steps needed to process one input file and produce all the output files that result from doing that.
You can test whether you've done this correctly by calling the driver function, passing it the path and file names that used to be hardcoded, and seeing if it produces the same output as before.
Once that is done, import the file the function is in (which is now called a module) into your batch-processing script and invoke the new driver function multiple times, passing different input and output file names to it each time.
I've done the first step for you (and fixed the mixed indenting). Note, however, that it's untested, since that's impossible for me to actually do:
import os
import re
import csv
from bs4 import BeautifulSoup
def driver(folder, input_filename, output_filename1, output_filename2):
    os.chdir(folder)
    with open(input_filename, 'r') as f:
        hearing = f.read()

    Hearing = BeautifulSoup(hearing)
    Hearing = Hearing.get_text()
    Hearing = Hearing.split("erroneous text")

    speakers = re.findall("\\n Mr. [A-Z][a-z]+\.|\\n Ms. [A-Z][a-z]+\.|\\n Congressman [A-Z][a-z]+\.|\\n Congresswoman [A-Z][a-z]+\.|\\n Chairwoman [A-Z][a-z]+\.|\\n Chairman [A-Z][a-z]+\.", hearing)
    speakers = list(set(speakers))
    print speakers

    position = []
    for speaker in speakers:
        x = hearing.find(speaker)
        position.append(x)

    def find_speaker(hearing, speakers):
        position = []
        for speaker in speakers:
            x = hearing.find(speaker)
            if x == -1:
                x += 1000000
            position.append(x)
        first = min(position)
        name = speakers[position.index(min(position))]
        name_length = len(name)
        chunk = [name, hearing[0:first], hearing[first+name_length:]]
        return chunk

    chunks = []
    print hearing
    names = []

    while len(hearing) > 10:
        chunk_try = find_speaker(hearing, speakers)
        hearing = chunk_try[2]
        chunks.append(chunk_try[1])
        names.append(chunk_try[0].strip())

    print len(hearing)  # 0
    #print dialogue[0:5]

    chunks.append(hearing)
    chunks = chunks[1:]

    print len(names)  # 138
    print len(chunks)  # 138

    data = zip(names, chunks)

    with open(output_filename1, 'wb') as f:
        w = csv.writer(f)
        w.writerow(['Speaker', 'Speach'])
        for row in data:
            w.writerow(row)

    paragraphs = str(chunks)
    print (paragraphs)

    Paragraphs = paragraphs.split("\\n")
    data1 = zip(Paragraphs)

    with open(output_filename2, 'wb') as f:
        w = csv.writer(f)
        w.writerow(['Paragraphs'])
        for row in data1:
            w.writerow(row)

    return True  # success

if __name__ == '__main__':
    driver('path in computer', 'hearing_name.htm', 'filename.csv', 'Paragraphs.csv')
You can use a lot less memory with no real downside if you process these files individually. Rather than reading every file up front and adding it to a list for future processing, process one file, then move on to the next.
As for the no results, I'm not totally sure. Are you not getting any errors?
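To make the batch step concrete, here is a minimal sketch of the separate batch-processing script, assuming the driver above is saved in a module named hearing_parser.py (the module name and the output-file naming scheme are placeholders). It processes one hearing at a time, so only a single file is ever held in memory.

import os
from hearing_parser import driver  # hypothetical module containing driver() above

folder = os.path.abspath('path in computer')  # absolute path, since driver() calls os.chdir()

for filename in os.listdir(folder):
    if not filename.endswith('.htm'):
        continue
    base = os.path.splitext(filename)[0]
    driver(folder, filename, base + '_speakers.csv', base + '_paragraphs.csv')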