Tokenizing tweets in Python

enneg3clear.txt is a file with one tweet per line, already stripped of punctuation and stopwords.
import re, string
import sys

# this code tokenizes
input_file = 'enneg3clear.txt'
with open(input_file) as f:
    lines = f.readlines()

results = []
texts = []
for line in lines:
    texts = [word for word in line.lower().split()]
    results.append(texts)
print results
[['\xef\xbb\xbfmy', 'good', 'sis', 'kelly', 'bouta', 'compete', 'with', 'adele', 'that', 'over', 'weinvm'],
 ['going', 'miss', 'japppaaannnnn'],
 ['its', 'so', 'hard', 'get', 'out', 'bed', 'morning', 'vote5sos', 'kca'],
 ['police', 'fatally', 'shoot', 'homeless', 'man', 'losangeles', 'gtgt'],
 ['my', 'trumpet', 'has', 'been', 'idle', 'days', 'now'],
 ['mercenaries', 'was', 'game', 'i', 'lent', 'friend', 'never', 'saw', 'again'],
 ['yeah', 'i', 'miss', 'you', 'all', 'so', 'much', 'already'],
 ['acabou', 'talitaaraujonomaisvoce'],
 ['im', 'at', 'strain', 'station', 'waiting', 'train', 'arrive', 'sigh', 'im', 'sooo', 'tired']]
#remove words that appear only once
all_tokens = sum(results, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
print tokens_once
set(['all', 'already', 'tired', 'sigh', 'over', 'hard', 'sooo', 'yeah', 'strain', '\xef\xbb\xbfmy', 'japppaaannnnn', 'adele', 'at', 'homeless', 'trumpet', 'its', 'out', 'sis', 'again', 'police', 'vote5sos', 'gtgt',
'saw', 'that', 'idle', 'been', 'mercenaries', 'waiting', 'station', 'you', 'has', 'was', 'friend', 'losangeles', 'kca', 'get', 'never', 'much', 'game', 'train', 'lent', 'now', 'with', 'bouta', 'man', 'shoot', 'going',
'talitaaraujonomaisvoce', 'fatally', 'days', 'bed', 'morning', 'weinvm', 'good', 'compete', 'acabou', 'kelly', 'arrive', 'my'])
results = [[word for word in results if word not in tokens_once]]
print (results)
File "atokenize.py", line 25, in <module>
results = [[word for word in results if word not in tokens_once]]
TypeError: unhashable type: 'list'
So the error comes from the second-to-last line. Any idea how to solve this?

Your results variable contains a list of lists, so you have to flatten it.
So simply put
results = [j for i in results for j in i]
above the line
results = [[word for word in results if word not in tokens_once]]
OR, as another solution, change append to extend:
for line in lines:
    texts = [word for word in line.lower().split()]
    results.extend(texts)  # or results += texts
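For reference, here is a minimal Python 3 sketch of the whole script that keeps the per-tweet structure instead of flattening it (assuming the same one-tweet-per-line input file), and counts tokens with a Counter instead of the quadratic all_tokens.count() loop:
from collections import Counter

# utf-8-sig strips the BOM that shows up above as '\xef\xbb\xbf'
with open('enneg3clear.txt', encoding='utf-8-sig') as f:
    results = [line.lower().split() for line in f]

# count every token once, instead of calling all_tokens.count() per word
counts = Counter(token for tweet in results for token in tweet)
tokens_once = {token for token, n in counts.items() if n == 1}

# filter each tweet separately, so results stays a list of lists
results = [[word for word in tweet if word not in tokens_once] for tweet in results]
print(results)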

Related

Python MRJob Script Sorting Results - Top Ten Words Syllable Count

I am trying to make a job that takes in a text file, only processes words that are not in the STOPWORDS set, counts the number of syllables in each word, then returns the top 10 words with the most syllables, sorting the results.
I believe everything is correct; I'm just not sure how to make the reducer sort the results.
Here is my code:
%%file top_10_syllable_count.py
import re
from sys import stderr
from mrjob.job import MRJob
from mrjob.step import MRStep
import syllables

WORD_RE = re.compile(r"[\w']+")

def splitter(text):
    WORD_RE = re.compile(r"[\w']+")
    return WORD_RE.findall(text)

def sort_results(results):
    """
    Sorts a list of 2-tuples descending by the first value in the
    tuple, ascending by the second value in the tuple.
    """
    return sorted(results, key=lambda k: (-k[0], k[1]))

STOPWORDS = {
    'i', 'we', 'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during',
    'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such',
    'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each',
    'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me',
    'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up',
    'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been',
    'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so',
    'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself',
    'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by',
    'doing', 'it', 'how', 'further', 'was', 'here', 'than'
}

class MRMostTenSyllables(MRJob):
    def steps(self):
        return [
            MRStep(mapper=self.mapper_get_words),
            MRStep(reducer=self.reducer_find_max_word)
        ]

    def mapper_get_words(self, _, line):
        for word in WORD_RE.findall(line):
            if word.lower() not in STOPWORDS:
                syllable_count = syllables.estimate(word)
                yield None, (syllable_count, word.lower())

    def reducer_find_max_word(self, key, values):
        self.list = []
        for value in values:
            self.list.append(value)
        self.new = []
        for i in range(10):
            self.new.append(max(self.list))
            self.list.remove(max(self.list))
        for i in range(10):
            yield self.new[i]

if __name__ == '__main__':
    import time
    start = time.time()
    MRMostTenSyllables.run()
    end = time.time()
    print("Run time:", end - start, "seconds", file=stderr)  # report run time on stderr

Write a function which removes English stop words from a tweet

I want to write a function that removes English stop words from a tweet.
Function Specifications:
It should take a pandas dataframe as input.
Should tokenise the sentences according to the definition in function 6. Note that function 6 cannot be called within this function.
Should remove all stop words in the tokenised list. The stopwords are defined in the stop_words_dict variable defined at the top of this notebook.
The resulting tokenised list should be placed in a column named "Without Stop Words".
The function should modify the input dataframe.
The function should return the modified dataframe.
Here is the twitter dataframe:
import pandas as pd

twitter_url = 'https://raw.githubusercontent.com/Explore-AI/Public-Data/master/Data/twitter_nov_2019.csv'
twitter_df = pd.read_csv(twitter_url)
twitter_df.head()
Here are the 'stop_words' in a dictionary:
stop_words_dict = {
    'stopwords': [
        'where', 'done', 'if', 'before', 'll', 'very', 'keep', 'something', 'nothing', 'thereupon',
        'may', 'why', '’s', 'therefore', 'you', 'with', 'towards', 'make', 'really', 'few', 'former',
        'during', 'mine', 'do', 'would', 'of', 'off', 'six', 'yourself', 'becoming', 'through',
        'seeming', 'hence', 'us', 'anywhere', 'regarding', 'whole', 'down', 'seem', 'whereas', 'to',
        'their', 'various', 'thereafter', '‘d', 'above', 'put', 'sometime', 'moreover', 'whoever', 'although',
        'at', 'four', 'each', 'among', 'whatever', 'any', 'anyhow', 'herein', 'become', 'last', 'between', 'still',
        'was', 'almost', 'twelve', 'used', 'who', 'go', 'not', 'enough', 'well', '’ve', 'might', 'see', 'whose',
        'everywhere', 'yourselves', 'across', 'myself', 'further', 'did', 'then', 'is', 'except', 'up', 'take',
        'became', 'however', 'many', 'thence', 'onto', '‘m', 'my', 'own', 'must', 'wherein', 'elsewhere', 'behind',
        'becomes', 'alone', 'due', 'being', 'neither', 'a', 'over', 'beside', 'fifteen', 'meanwhile', 'upon', 'next',
        'forty', 'what', 'less', 'and', 'please', 'toward', 'about', 'below', 'hereafter', 'whether', 'yet', 'nor',
        'against', 'whereupon', 'top', 'first', 'three', 'show', 'per', 'five', 'two', 'ourselves', 'whenever',
        'get', 'thereby', 'noone', 'had', 'now', 'everyone', 'everything', 'nowhere', 'ca', 'though', 'least',
        'so', 'both', 'otherwise', 'whereby', 'unless', 'somewhere', 'give', 'formerly', '’d', 'under',
        'while', 'empty', 'doing', 'besides', 'thus', 'this', 'anyone', 'its', 'after', 'bottom', 'call',
        'n’t', 'name', 'even', 'eleven', 'by', 'from', 'when', 'or', 'anyway', 'how', 'the', 'all',
        'much', 'another', 'since', 'hundred', 'serious', '‘ve', 'ever', 'out', 'full', 'themselves',
        'been', 'in', "'d", 'wherever', 'part', 'someone', 'therein', 'can', 'seemed', 'hereby', 'others',
        "'s", "'re", 'most', 'one', "n't", 'into', 'some', 'will', 'these', 'twenty', 'here', 'as', 'nobody',
        'also', 'along', 'than', 'anything', 'he', 'there', 'does', 'we', '’ll', 'latterly', 'are', 'ten',
        'hers', 'should', 'they', '‘s', 'either', 'am', 'be', 'perhaps', '’re', 'only', 'namely', 'sixty',
        'made', "'m", 'always', 'those', 'have', 'again', 'her', 'once', 'ours', 'herself', 'else', 'has', 'nine',
        'more', 'sometimes', 'your', 'yours', 'that', 'around', 'his', 'indeed', 'mostly', 'cannot', '‘ll', 'too',
        'seems', '’m', 'himself', 'latter', 'whither', 'amount', 'other', 'nevertheless', 'whom', 'for', 'somehow',
        'beforehand', 'just', 'an', 'beyond', 'amongst', 'none', "'ve", 'say', 'via', 'but', 'often', 're', 'our',
        'because', 'rather', 'using', 'without', 'throughout', 'on', 'she', 'never', 'eight', 'no', 'hereupon',
        'them', 'whereafter', 'quite', 'which', 'move', 'thru', 'until', 'afterwards', 'fifty', 'i', 'itself', 'n‘t',
        'him', 'could', 'front', 'within', '‘re', 'back', 'such', 'already', 'several', 'side', 'whence', 'me',
        'same', 'were', 'it', 'every', 'third', 'together'
    ]
}
Here is the code I have tried writing:
def stop_words_remover(df):
    df['With Stop Words'] = df['Tweets'].str.split()
    df['With Stop Words']
    stop_words = stop_words_dict.values()
    stop_words
    df['Without Stop Words'] = df['With Stop Words'].replace(stop_words, '')
    df = df[['Tweets', 'Date', 'Without Stop Words']]
    return df

stop_words_remover(twitter_df.copy())
This is the output I got:
TypeError Traceback (most recent call last)
C:\Users\DATASC~1\AppData\Local\Temp/ipykernel_5696/4217028502.py in <module>
15
16
---> 17 stop_words_remover(twitter_df.copy())
18 ### END FUNCTION
C:\Users\DATASC~1\AppData\Local\Temp/ipykernel_5696/4217028502.py in stop_words_remover(df)
4 stop_words = stop_words_dict.values()
5
----> 6 df['Without Stop Words'] = df['With Stop Words'].replace(stop_words, '', stop_words())
7
8 df = df[['Tweets', 'Date', 'Without Stop Words']]
TypeError: 'dict_values' object is not callable
This is the expected output:
stop_words_remover(twitter_df.copy())
Tweets Date Without Stop Words
0 #BongaDlulane Please send an email to mediades... 2019-11-29 12:50:54 [#bongadlulane, send, email, mediadesk#eskom.c...
1 #saucy_mamiie Pls log a call on 0860037566 2019-11-29 12:46:53 [#saucy_mamiie, pls, log, 0860037566]
2 #BongaDlulane Query escalated to media desk. 2019-11-29 12:46:10 [#bongadlulane, query, escalated, media, desk.]
3 Before leaving the office this afternoon, head... 2019-11-29 12:33:36 [leaving, office, afternoon,, heading, weekend...
4 #ESKOMFREESTATE #MEDIASTATEMENT : ESKOM SUSPEN... 2019-11-29 12:17:43 [#eskomfreestate, #mediastatement, :, eskom, s...
... ... ... ...
195 Eskom's Visitors Centres’ facilities include i... 2019-11-20 10:29:07 [eskom's, visitors, centres’, facilities, incl...
196 #Eskom connected 400 houses and in the process... 2019-11-20 10:25:20 [#eskom, connected, 400, houses, process, conn...
197 #ArthurGodbeer Is the power restored as yet? 2019-11-20 10:07:59 [#arthurgodbeer, power, restored, yet?]
198 #MuthambiPaulina #SABCNewsOnline #IOL #eNCA #e... 2019-11-20 10:07:41 [#muthambipaulina, #sabcnewsonline, #iol, #enc...
199 RT #GP_DHS: The #GautengProvince made a commit... 2019-11-20 10:00:09 [rt, #gp_dhs:, #gautengprovince, commitment, e...
Please can someone help me?
There is a simple way to do this in a single command using apply with a lambda:
twitter_df["Tweets"].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words_dict["stopwords"]]))
If you prefer to create a function to do this, the function could be:
def remove_stop_words(tweet, stop_words_dict):
    sentence = tweet.split()
    output = []
    for word in sentence:
        if word not in stop_words_dict["stopwords"]:
            output.append(word)
    return " ".join(output)

twitter_df["Tweets"].apply(lambda x: remove_stop_words(x, stop_words_dict))

How to solve this problem with the DistilBERT tokenizer?

from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenized_input = tokenizer(
    sentences, truncation=True, is_split_into_words=True, padding='max_length', max_length=120
)
sentences is a list of lists of words:
for sen in sentences[:5]:
    print(sen)
['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
['Families', 'of', 'soldiers', 'killed', 'in', 'the', 'conflict', 'joined', 'the', 'protesters', 'who', 'carried', 'banners', 'with', 'such', 'slogans', 'as', '"', 'Bush', 'Number', 'One', 'Terrorist', '"', 'and', '"', 'Stop', 'the', 'Bombings', '.', '"']
['They', 'marched', 'from', 'the', 'Houses', 'of', 'Parliament', 'to', 'a', 'rally', 'in', 'Hyde', 'Park', '.']
['Police', 'put', 'the', 'number', 'of', 'marchers', 'at', '10,000', 'while', 'organizers', 'claimed', 'it', 'was', '1,00,000', '.']
['The', 'protest', 'comes', 'on', 'the', 'eve', 'of', 'the', 'annual', 'conference', 'of', 'Britain', "'s", 'ruling', 'Labor', 'Party', 'in', 'the', 'southern', 'English', 'seaside', 'resort', 'of', 'Brighton', '.']
I get this error:
<ipython-input-79-1d6d1ec05183> in <module>()
2 tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
3 tokenized_input = tokenizer(
----> 4 sentences, truncation=True, is_split_into_words=True, padding='max_length', max_length=120
5 )
2 frames
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in _batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose)
407 batch_text_or_text_pairs,
408 add_special_tokens=add_special_tokens,
--> 409 is_pretokenized=is_split_into_words,
410 )
411
TypeError: PreTokenizedEncodeInput must be Union[PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]]
It works for slices of sentences. For example,
tokenized_input = tokenizer(
    sentences[:76], truncation=True, is_split_into_words=True, padding='max_length', max_length=120
)
runs just fine, but if I do sentences[:77] there's an error.
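Since the failure appears exactly when sentences[76] enters the batch, that entry is most likely not a list of strings (for example a plain string, an empty value, or a non-string token such as a float that came from a NaN). A quick diagnostic sketch to find any malformed entries:
# is_split_into_words=True expects each example to be a list of strings;
# print the index and a preview of anything that isn't
for i, sen in enumerate(sentences):
    if not isinstance(sen, list) or not all(isinstance(tok, str) for tok in sen):
        print(i, repr(sen)[:80])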

How to print a list of tokenized text into a file

from urllib import request
from redditscore.tokenizer import CrazyTokenizer

tokenizer = CrazyTokenizer()
url = "http://www.site.uottawa.ca/~diana/csi5386/A1_2020/microblog2011.txt"
for line in request.urlopen(url):
    tokens = tokenizer.tokenize(line.decode('utf-8'))
    #print(tokens)
with open('your_file.txt', 'a') as f:
    print(tokens)
    for item in tokens:
        f.write("%s\n" % item)
In the above code, my output is in the variable tokens in the form of a list.
Output sample:
['\ufeffsave', 'bbc', 'world', 'service', 'from', 'savage', 'cuts']
['a', 'lot', 'of', 'people', 'always', 'make', 'fun', 'about', 'the', 'end', 'of', 'the', 'world', 'but', 'the', 'question', 'is', '"are', 'u', 'ready', 'for', 'it']
['rethink', 'group', 'positive', 'in', 'outlook', 'technology', 'staffing', 'specialist', 'the', 'rethink', 'group', 'expects', 'revenues', 'to', 'be']
Now I'm trying to write this output to a text file. How can I do that? Please help.
with open('your_file.txt', 'a') as f:
    for line in request.urlopen(url):
        tokens = tokenizer.tokenize(line.decode('utf-8'))
        #print(tokens)
        for item in tokens:
            f.write("%s\n" % item)
Just use ' '.join on each token list, like the following (I am assuming that I already have the data in an array):
tokens = [
    ['\ufeffsave', 'bbc', 'world', 'service', 'from', 'savage', 'cuts'],
    ['a', 'lot', 'of', 'people', 'always', 'make', 'fun', 'about', 'the', 'end',
     'of', 'the', 'world', 'but', 'the', 'question', 'is', '"are', 'u', 'ready',
     'for', 'it'],
    ['rethink', 'group', 'positive', 'in', 'outlook', 'technology', 'staffing',
     'specialist', 'the', 'rethink', 'group', 'expects', 'revenues', 'to', 'be']
]

with open('your_file.txt', 'a') as f:
    print(tokens)
    for item in tokens:
        f.write("%s\n" % ' '.join(item))

Pandas not dividing length of cells

I've been struggling with this problem for a long time. I have a dataframe that looks like this:
[screenshot of the dataframe]
I'm trying to divide the length of each 'counter' by the length of each 'content'. I thought this would be fairly straightforward. So far I've tried:
reviews['diversity'] = reviews['counter'].apply(lambda x: 0 if len(x) == 0 else float(len(x)) / float(len(reviews['content'][x])))
as well as using x['content']. I get the massive error message KeyError: "None of [['aberfeldy', 'recorded', 'their', 'debut', 'young', 'forever', 'using', 'a', 'single', 'microphone', 'good', 'for', 'them', 'in', 'that', 'spirit', 'i', 'cut', 'short', 'my', 'obligatory', 'introduction', 'and', 'bring', 'you', 'straight', 'to', 'the', 'edinburgh', 'group', 'lovelorn', 'unfortunately', 'still', 'heart', 'exposed', 'by', 'oh', 'production', 'love', 'is', 'verb', 'noun', 'as', 'well', 'find', 'it', 'dictionary', 'under', 'l', 'little', 'witticism', 'comes', 'from', 'an', 'arrow', 'written', 'sung', 'riley', 'briggs', 'based', 'on', 'one', 'photo', 'looks', 'like', 'anthony', 'michael', 'hall', 'though', 'his', 'vocals', 'chart', 'fairly', 'standard', 'indie', 'course', 'borrowing', 'neil', 'friend', 'ben', 'gibbard', 'what', 'do', 'plain', 'sensitive', 'guys', 'everywhere', 'listen', 'some', 'of', 'best', 'friends', 'are', 'favorite', 'albums', 'consist', 'campfire', 'singalongs', 'bands', 'with', 'modest', 'acoustic', 'guitar', 'chops', 'cute', 'names', 'accents', 'but', 'those', 'lyrics', 'no', 'band', 'would', 'sing', 'such', 'words', 'deserves', 'easily', 'made', 'comparisons', 'fellow', 'scots', 'belle', '', 'sebastian', 'or', 'even', 'camera', 'obscura', 'let', 'alone', 'earnest', 'aussies', 'lucksmiths', 'compare', 'twee', 'progenitors', 'pastels', 'talulah', 'gosh', 'owe', 'me', 'your', 'cardigan', 'moniker', 'nipped', 'scottish', 'vacation', 'destination', 'practically', 'beg', 'name', 'there', 'need', 'encourage', 'throughout', 'record', 'shows', 'predisposition', 'toward', 'bungling', 'old', 'english', 'teachers', 'motto', 'show', 'not', 'tell', 'this', 'may', 'be', 'result', 'medical', 'condition', 'dyslexia', 'which', 'case', 'we', 'should', 'hold', 'our', 'snark', 'seems', 'guy', 'can', 'open', 'mouth', 'without', 'saying', 'nothing', 'so', 'sad', 'leaving', 'he', 'sings', 'out', 'lonely', 'now', 'she', 'gone', 'adds', 'tie', 'teems', 'vivid', 'storytelling', 'goes', 'rhyme', 'sacred', 'wasted', 'reasons', 'until', 'somewhere', 'editor', 'rhyming', 'loses', 'her', 'job', 'often', 'at', 'when', 'they', 'stumble', 'beyond', 'trite', 'infantilism', 'first', 'vegetarian', 'restaurant', 'lopes', 'along', 'winning', 'tangled', 'up', 'blue', 'strums', 'accented', 'subtle', 'fiddles', 'lovely', 'boy', 'harmonies', 'seemingly', 'aiming', 'album', 'cheerful', 'unpretentious', 'look', 'everyday', 'here', 'finally', 'makes', 'interesting', 'way', 'dance', 'kitchen', 'says', 'willing', 'see', 'where', 'takes', 'him', 'then', 'proclaims', 'sometimes', 'believe', 'human', 'duck', 'cover', 'speaking', 'aliens', 'heliopolis', 'night', 'next', 'track', 'incidentally', 'its', 'second', 'whimsical', 'spaceship', 'song', 'complete', 'nose', 'perfect', 'unique', 'yeah', 'was', 'means', 'warm', 'pop', 'heats', 'headphones', 'veritable', 'help', 'root', 'begins', 'everyone', 'because', 'last', 'thing', 'world', 'needs', 'another', 'batch', 'sullen', 'scenesters', 'yet', 'any', 'relationship', 'just', 'someone', 'doesn', 'mean', 'back', 'beautiful', 'gibbs', 'tells', 'us', 'tender', 'moment', 'probably', 'if', 'hope', 'gets', 'laid']] are in the [index]".
I've tried:
def diverse(x):
    if len(x) == 0:
        return 0
    else:
        return float(len(x)) / float(len(reviews['clean'][x]))

reviews['diverse'] = reviews['counter'].apply(diverse)
and get the same thing.
I've tried using applymap with reviews['diversity'] = reviews.applymap(lambda x: 0 if len(x) == 0 else float(len(reviews['counter'][x])) / float(len(reviews['content'][x])))
and get ("object of type 'int' has no len()", 'occurred at index Unnamed: 0').
And yet if I just do float(len(reviews['counter'][4])) / float(len(reviews['clean'][4])), I get 0.634375.
Any help is much appreciated.
edit: I tried:
def test(x, y):
    for row, item in x.iteritems():
        x = float(len(item))
    for row, item in y.iteritems():
        if len(item) == 0:
            return (0)
        else:
            y = float(len(item))
        return (x / y)
When I used "print" instead of "return", it gave me all the values. But return only divides the length of the first row, which seems really weird?
Here is a toy example I constructed to show how to do what you are asking:
import pandas as pd
from collections import Counter

df = pd.DataFrame([['hello world i am a computer'],
                   ['hello i am a computer too hello computer']],
                  columns=['content'])
df['counter'] = df.content.str.split().apply(Counter)
df
# returns:
                                     content                                            counter
0                hello world i am a computer  {'am': 1, 'hello': 1, 'computer': 1, 'world': ...
1  hello i am a computer too hello computer  {'am': 1, 'hello': 2, 'computer': 2, 'a': 1, '...
This line answers the question as you phrased it (the length of each counter divided by the length of each content string):
df['diversity'] = df.counter.apply(len) / df.content.str.len()
But I think what you really wanted was to break the strings in content into a list of words by splitting on the space character. In that case, you probably want:
df['diversity'] = df.counter.apply(len) / df.content.str.split().apply(len)
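As a quick check on the toy frame above: both rows have six distinct words, over six and eight total words respectively, so the word-based ratio works out like this:
df['diversity'] = df.counter.apply(len) / df.content.str.split().apply(len)
print(df['diversity'].tolist())  # [1.0, 0.75]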
