Python Pandas: handling special characters in strings

I wrote a function that I want to apply to a dataframe later.
def get_word_count(text, df):
    # text is a lowercase list of words
    # df is a dataframe with 2 columns: word and count
    # this function updates the word counts
    # f = open('stopwords.txt', 'r')
    # stopwords = f.read()
    stopwords = 'in the and an - '
    for word in text:
        if word not in stopwords:
            if df['word'].str.contains(word).any():
                df.loc[df['word'] == word, 'count'] = df['count'] + 1
            else:
                df.loc[0] = [word, 1]
                df.index = df.index + 1
    return df
Then I check it:
import pandas as pd

word_df = pd.DataFrame(columns=['word', 'count'])
sentence1 = '[first] - missing "" in the text [first] word'.split()
y = get_word_count(sentence1, word_df)
sentence2 = "error: wrong word in the [second] text".split()
y = get_word_count(sentence2, word_df)
y
I get the following results:
   word    count
[first]        2
missing        1
""             1
text           2
word           2
error:         1
wrong          1
So where is [second] from sentence2?
If I omit one of the square brackets I get an error message. How do I handle words with special characters? Note that I don't want to strip them out, because then I would miss "" in sentence1.

The problem comes from this line:

if df['word'].str.contains(word).any():

str.contains interprets its argument as a regular expression and reports whether any entry in the word column matches it. The pattern [second] is a regex character class matching any one of the letters s, e, c, o, n, d, so df['word'].str.contains('[second]') returns True for [first] (it contains an s). The update branch then runs, but no row is exactly equal to [second], so the word is silently dropped. An unbalanced bracket is simply an invalid regex, which is why you get an error message.
For a quick fix, I changed the line to:
if word in df['word'].tolist():
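If you would rather keep str.contains, you can instead turn off its regex interpretation; a minimal sketch (regex=False is a standard str.contains keyword), plus an exact-equality variant that matches the update on df['word'] == word:

# treat the word as a literal string, not a regular expression
if df['word'].str.contains(word, regex=False).any():
    ...

# or test exact equality, since the update selects rows with df['word'] == word
if (df['word'] == word).any():
    ...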

Creating a DataFrame row by row in a loop like that is not recommended; you should do something like this:

stopwords = 'in the and an - '
sentence = sentence1 + sentence2  # both are already lists of words from .split()
df = pd.DataFrame(sentence, columns=['Words'])
df = df.groupby(by=['Words'])['Words'].size().reset_index(name='counts')
df = df[~df['Words'].isin(stopwords.split())]
print(df)
        Words  counts
0          ""       1
2     [first]       2
3    [second]       1
4      error:       1
6     missing       1
7        text       2
9        word       2
10      wrong       1
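As an aside, value_counts gets the same frequencies in one step; a minimal sketch reusing sentence and stopwords from above:

counts = pd.Series(sentence).value_counts()
counts = counts[~counts.index.isin(stopwords.split())]
print(counts)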

I have rebuilt it in a way that lets you keep adding sentences and watch the frequencies grow:
from collections import Counter
from collections import defaultdict
import pandas as pd

def terms_frequency(corpus, stop_words=None):
    '''
    Takes in text and returns a pandas DataFrame of word frequencies
    '''
    corpus_ = corpus.split()
    # remove stop words (accept either a whitespace-separated string or a list;
    # a plain substring test against the string would drop too many words)
    stop_words = set(stop_words.split() if isinstance(stop_words, str) else stop_words or [])
    terms = [word for word in corpus_ if word not in stop_words]
    terms_freq = pd.DataFrame.from_dict(Counter(terms), orient='index').reset_index()
    terms_freq = terms_freq.rename(columns={'index': 'word', 0: 'count'}).sort_values('count', ascending=False)
    terms_freq.reset_index(inplace=True)
    terms_freq.drop('index', axis=1, inplace=True)
    return terms_freq

def get_sentence(sentence, storage, stop_words=None):
    storage['sentences'].append(sentence)
    corpus = ' '.join(storage['sentences'])
    return terms_frequency(corpus, stop_words)

# tests
STOP_WORDS = 'in the and an - '
storage = defaultdict(list)
S1 = '[first] - missing "" in the text [first] word'
print(get_sentence(S1, storage, STOP_WORDS))
print('\nNext S2')
S2 = 'error: wrong word in the [second] text'
print(get_sentence(S2, storage, STOP_WORDS))
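Each call folds the new sentence into storage, so the counts keep accumulating across calls; for example (S3 is just an illustrative sentence):

print('\nNext S3')
S3 = 'text keeps growing with every new text'
print(get_sentence(S3, storage, STOP_WORDS))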

Related

How to remove rows that have English (or a specific language's) sentences in pandas

I have a pandas data frame with 2 columns: the first contains Arabic sentences and the second one contains labels (1, 0).
I want to remove all rows that contain English sentences.
Any suggestions?
Here is an example; I want to remove the second row:
إيطاليا لتسريع القرارات اللجوء المهاجرين، الترحيل [0]
Border Patrol Agents Recover 44 Migrants from Stash House [0]
الديمقراطيون مواجهة الانتخابات رفض عقد اجتماعات تاون هول [0]
شاهد لايف: احتفال ترامب "اجعل أمريكا عظيمة مرة أخرى" - بريتبارت [0]
المغني البريطاني إم آي إيه: إدارة ترامب مليئة بـ "كذابون باثولوجي" [0]
You can create a list of common English letters and remove each line that contains any of these letters, like this:
ENGletters = ['a', 'o', 'i']
df = df[~df['text_col'].str.contains('|'.join(ENGletters))]
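A broader variant of the same idea (my generalization, not part of the original answer) is to match any Latin letter with a regex character class instead of listing letters by hand:

# drop rows whose text contains any ASCII letter
df = df[~df['text_col'].str.contains('[A-Za-z]')]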
You could make a list of English words and detect when a word is used.
letters = 'hello how a thing person'.split(' ')

def lookForLetters(text):
    text = text.lower().split(' ')  # lowercase so 'Hello' matches 'hello'
    for i in letters:
        if i.lower() in text:
            return True
    return False

print(lookForLetters('المغني البريطاني إم آي إيه: إدارة ترامب مليئة بـ "كذابون باثولوجي"'))
print(lookForLetters("Hello sir! How are you doing?"))
The first print outputs False and the second outputs True.
Be sure to add more common English words to the first list.
You can use langdetect to filter your df.
from langdetect import detect
df['language'] = df['title'].apply(detect)

Then filter your df to remove non-Arabic titles:

df = df[df['language'] == 'ar']
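One caveat: detect raises an exception on empty or undetectable strings, so a guarded wrapper is safer; a sketch, assuming langdetect's top-level LangDetectException export and the same title column:

from langdetect import detect, LangDetectException

def safe_detect(text):
    try:
        return detect(text)
    except LangDetectException:  # raised for empty/undetectable input
        return 'unknown'

df['language'] = df['title'].apply(safe_detect)
df = df[df['language'] == 'ar']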

How to determine the number of negation words per sentence

I would like to know how to count how many negation words (no, not) and abbreviations (n't) there are in each sentence and in the whole text.
For the number of sentences I am applying the following:

df["sent"] = df['text'].str.count(r'[\w][\.!\?]')

However, this gives me the count of sentences in a text. I would also need the number of negation words per sentence and within the whole text.
Can you please give me some tips?
The expected output for the text column is shown below:

text                                  sent  count_n_s  count_tot
I haven't tried it yet                   1        1          1
I do not like it. What do you think?     2        0.5        1
It's marvellous!!!                       1        0          0
No, I prefer the other one.              2        1          1
count_n_s is given by counting the total number of negation words per sentence, then dividing by the number of sentences.
I tried:

split_w = re.split("\w+", df['text'])
neg_words = ['no', 'not', "n't"]
words = [w for i, w in enumerate(split_w) if i and (split_w[i-1] in neg_words)]
This would get a count of total negations in the text (not for individual sentences):
import re

NEG = r"(?:^(?:no|not)$)|n't"
NEG_RE = re.compile(NEG)

def get_count(text):
    count = 0
    for word in text:
        if NEG_RE.search(word):
            count += 1
    return count

df['text_list'] = df['text'].apply(lambda x: x.split())
df['count'] = df['text_list'].apply(lambda x: get_count(x))
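To also get the per-sentence figures from the expected output, you can combine get_count with the sentence count from the question; a sketch, assuming the question's regex is a good-enough sentence counter:

df['sent'] = df['text'].str.count(r'[\w][\.!\?]')
df['count_tot'] = df['text'].apply(lambda t: get_count(t.split()))
df['count_n_s'] = df['count_tot'] / df['sent']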
To get the count of negations for individual lines, use the code below. Note that it strips each word down to its leading word characters (so the comma in 'No,' is dropped), which also turns haven't into haven; since that no longer contains n't, add such contractions to neg_words explicitly if you want them counted.
import re

str1 = '''I haven't tried it yet
I do not like it. What do you think?
It's marvellous!!!
No, I prefer the other one.'''

neg_words = ['no', 'not', "n't"]

for text in str1.split('\n'):
    split_w = re.split(r"\s", text.lower())
    # to get rid of special characters such as the comma in 'No,',
    # keep only each word's leading word characters
    split_w = [re.search(r'^\w+', w).group(0) for w in split_w]
    words = [w for w in split_w if w in neg_words]
    print(len(words))

How to remove duplicate words only from a string with str.join(), leaving numbers/digits intact?

I've been trying for hours to figure out and put together a somewhat complicated (for me) syntax with the .join function, but just can't get it to work.
The task is to remove all duplicate words from a string obtained through a scraping process, but leave all duplicate numbers and digits intact.
Example Code:
from collections import OrderedDict
examplestring = 'Your Brand22 For Awesome Product 1 Year 1 User Subscription Brand22'
print(' '.join(OrderedDict((w,w) for w in examplestring.split()).keys()))
>>> Your Brand22 For Awesome Product 1 Year User Subscription
Note that the above code works but also removes the duplicated 1 (1 Year 1 User), which I need to keep. I'm trying to leave the numbers intact by testing each word with the isdigit() function as .split() goes through the string word by word, but cannot figure out the proper syntax for it.
result = ' '.join(OrderedDict((w,w) for w in examplestring.split()).keys() if w not isdigit())
result = ([' '.join(OrderedDict((w,w) for w in examplestring.split()).keys())] if w not isdigit())
result = ' '.join([(OrderedDict((w,w) for w in examplestring.split()).keys()] if w not isdigit()))
I tried many more variations of the above one-liner and might even be missing an if statement, but all these brackets confuse me, so I'd be grateful if anyone could help me out.
Goal: Remove duplicate words but keep repeated digits/numbers inside the string
You can solve the problem by modifying the key when the word is a number. Here I'm using enumerate to append the index to numeric keys, so repeated numbers stay unique and survive deduplication:
from collections import OrderedDict

examplestring = 'Your Brand22 For Awesome Product 1 Year 1 User Subscription Brand22'
res = ' '.join(OrderedDict(((word + str(idx) if word.isnumeric() else word), word)
                           for idx, word in enumerate(examplestring.split())).values())
print(res)
Output:
Your Brand22 For Awesome Product 1 Year 1 User Subscription
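Since Python 3.7, a plain dict preserves insertion order too, so the same trick works without OrderedDict:

res = ' '.join({(w + str(i) if w.isnumeric() else w): w
                for i, w in enumerate(examplestring.split())}.values())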
Does this work for you?
example_str = '''Your Brand22 For Awesome Product 1 Year 1 User Subscription Brand22'''
words_list = example_str.split()
numeric_flags_list = [all(char.isnumeric() for char in word) for word in words_list]
unique_words = []
for word, numeric_flag in zip(words_list, numeric_flags_list):
    if numeric_flag:
        unique_words.append(word)  # always keep numbers, even repeated ones
    elif word not in unique_words:
        unique_words.append(word)  # keep a word only the first time it appears
print(' '.join(unique_words))

Counting the number of times a word appears in n tweets

I have a data frame of about 118,000 tweets. Here's a made up sample:
Tweets
1 The apple is red
2 The grape is purple
3 The tree is green
I have also used the set function to arrive at a list of every unique word found in my data frame of tweets. For the example above it looks like this (in no particular order):
Words
1 The
2 is
3 apple
4 grape
....so on
Basically I need to find out how many tweets contain a given word. For example, "The" is found in 3 tweets, "apple" is found in 1 tweet, "is" is found in 3 tweets, and so on.
I have tried using a nested for loop that looks like:
number_words = [0] * len(words)
for i in range(len(words)):
    for j in range(len(tweets)):
        if words[i] in tweets[j]:
            number_words[i] += 1
number_words
This creates a new list and, for each word, counts the number of tweets that contain it. However, I have found it incredibly inefficient; the code block takes forever to run.
What is a better way to do this?
You could use str.count:

df.Tweets.str.count(word).sum()

For example, supposing Words is the list:

for word in Words:
    print(f'{word} count: {df.Tweets.str.count(word).sum()}')

Full sample:
import io
import pandas as pd

data = """
Tweets
The apple is red
The grape is purple
The tree is green
"""
datb = """
Words
The
is
apple
grape
"""
# pd.compat.StringIO was removed from recent pandas; use io.StringIO instead
dfa = pd.read_csv(io.StringIO(data), sep=';')
dfb = pd.read_csv(io.StringIO(datb), sep=';')
Words = dfb['Words'].values
dico = {}
for word in Words:
    dico[word] = dfa.Tweets.str.count(word).sum()
print(dico)
output:
{'The': 3, 'is': 3, 'apple': 1, 'grape ': 1}
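Note that str.count treats the word as a regex and counts every occurrence, while the question asks how many tweets contain the word. A sketch that counts each tweet at most once, with a word boundary so that apple does not also match pineapple:

import re

for word in Words:
    n = dfa.Tweets.str.contains(rf'\b{re.escape(word)}\b').sum()
    print(f'{word} is found in {n} tweets')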
You can use a defaultdict to store all the word counts, like this:

from collections import defaultdict

word_counts = defaultdict(int)
for tweet in tweets:
    for word in tweet.split():  # split into words; iterating the string itself would yield characters
        word_counts[word] += 1
# print(word_counts['some_word']) will output the occurrences of some_word
This will take your list of tweets and turn the words into a dictionary of counts:

import collections

words = ' '.join(tweets).split()  # tweets is a list, so join before splitting into words
counter = collections.Counter(words)
for key, value in sorted(counter.items()):
    print("`{}` is repeated {} times".format(key, value))

To replace internet acronyms in a dataframe using dictionary

I'm working on a text mining project where I'm trying to replace abbreviations, slang words and internet acronyms present in text (in a dataframe column) using a manually prepared dictionary.
The problem I'm facing is that the code stops after the first word of the text in the dataframe column and never processes the rest, so nothing is replaced with the lookup words from the dict.
Here is the sample dictionary and code I use:
abbr_dict = {"abt":"about", "b/c":"because"}
def _lookup_words(input_text):
words = input_text.split()
new_words = []
for word in words:
if word.lower() in abbr_dict:
word = abbr_dict[word.lower()]
new_words.append(word)
new_text = " ".join(new_words)
return new_text
df['new_text'] = df['text'].apply(_lookup_words)
Example Input:
df['text'] =
However, industry experts are divided abt whether a Bitcoin ETF is necessary or not.
Desired Output:
df['New_text'] =
However, industry experts are divided about whether a Bitcoin ETF is necessary or not.
Current Output:
df['New_text'] =
However
You can try the following, using a lambda with join and split:
import pandas as pd

abbr_dict = {"abt": "about", "b/c": "because"}
df = pd.DataFrame({'text': ['However, industry experts are divided abt whether a Bitcoin ETF is necessary or not.']})
df['new_text'] = df['text'].apply(lambda row: " ".join(abbr_dict[w] if w.lower() in abbr_dict else w
                                                       for w in row.split()))
Or, to fix your code above: you need to move the join for new_text and the return statement outside of the for loop:
def _lookup_words(input_text):
    words = input_text.split()
    new_words = []
    for word in words:
        if word.lower() in abbr_dict:
            word = abbr_dict[word.lower()]
        new_words.append(word)
    new_text = " ".join(new_words)  # ..... change here
    return new_text  # ..... change here also

df['new_text'] = df['text'].apply(_lookup_words)
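A vectorized alternative (a sketch, not from the answers above) is to build one regex from the dictionary keys and let pandas do the replacement; re.escape handles keys like b/c:

import re

pattern = r'\b(' + '|'.join(map(re.escape, abbr_dict)) + r')\b'
df['new_text'] = df['text'].str.replace(pattern,
                                        lambda m: abbr_dict[m.group(0).lower()],
                                        regex=True)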
