Counting the top 10 most frequent words per row - python

I have a sample dataset like this:
"Author", "Normal_Tokenized"
x , ["I","go","to","war","I",..]
y , ["me","you","and","us",..]
z , ["let","us","do","our","best",..]
I want a dataframe reporting the 10 most frequent words and the counts (frequencies) for each author:
"x_text", "x_count", "y_text", "y_count", "z_text", "z_count"
go , 1000 , come , 120 , let , 12
and so on ...
I attempted it with the following snippet, but it only returns the last author's values instead of all authors' values.
This code does find the 10 most common words an author has used in his novel:
df_words = pd.concat([pd.DataFrame(
    data={'Author': [row['Author'] for _ in row['Normal_Tokenized']],
          'Normal_Tokenized': row['Normal_Tokenized']})
    for idx, row in df.iterrows()], ignore_index=True)
df_words = df_words[~df_words['Normal_Tokenized'].isin(stop_words)]

def authorCommonWords(numWords):
    for author in authors:
        authorWords = df_words[df_words['Author'] == author].groupby(
            'Normal_Tokenized').size().reset_index().rename(columns={0: 'Count'})
        authorWords.sort_values('Count', inplace=True)
        df = pd.DataFrame(authorWords[-numWords:])
        df.to_csv("common_word.csv", header=False, mode='a', encoding='utf-8',
                  index=False)
    return authorWords[-numWords:]

authorCommonWords(10)
There are about 130000 samples for each author. The snippet gets the 10 most repeated words across those samples, but I want these 10 words in a separate column for each author.

np.unique(return_counts=True) seems to be what you're looking for.
Data
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "Author": ["x", "y", "z"],
    "Normal_Tokenized": [["I", "go", "to", "war", "I"],
                         ["me", "you", "and", "us"],
                         ["let", "us", "do", "our", "best"]]
})
Code
n_top = 6  # count top n
df_want = pd.DataFrame(index=range(n_top))
for au, ls in df.itertuples(index=False, name=None):
    words, freqs = np.unique(ls, return_counts=True)
    # np.unique returns the words alphabetically, so reorder by descending count
    order = np.argsort(-freqs, kind="stable")
    words, freqs = words[order], freqs[order]
    len_words = len(words)
    if len_words >= n_top:
        df_want[f"{au}_text"] = words[:n_top]
        df_want[f"{au}_count"] = freqs[:n_top]
    else:  # too few distinct words
        df_want[f"{au}_text"] = [words[i] if i < len_words else "" for i in range(n_top)]
        df_want[f"{au}_count"] = [freqs[i] if i < len_words else 0 for i in range(n_top)]
Result
print(df_want)
  x_text  x_count y_text  y_count z_text  z_count
0      I        2    and        1   best        1
1     go        1     me        1     do        1
2     to        1     us        1    let        1
3    war        1    you        1    our        1
4               0                0     us        1
5               0                0              0
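
For a true "most frequent first" ordering you can also let collections.Counter do the work: Counter.most_common already returns (word, count) pairs sorted by descending count. A minimal alternative sketch, reusing df and n_top from above:

from collections import Counter

df_want = pd.DataFrame(index=range(n_top))
for au, ls in df.itertuples(index=False, name=None):
    top = Counter(ls).most_common(n_top)
    top += [("", 0)] * (n_top - len(top))  # pad when there are too few distinct words
    df_want[f"{au}_text"] = [w for w, _ in top]
    df_want[f"{au}_count"] = [c for _, c in top]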


Find character at which string can be differentiated from list of strings

For every string in a df column, I need the character at which this string becomes unique, that is, its uniqueness point (UP). For illustration, here is a toy dataframe:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'word': ['can', 'cans', 'canse', 'canpe', 'canp', 'camp'],
    'code': ['k#n', 'k#n}', 'k#(z', np.nan, 'k#()', np.nan]})
    word  code
0    can   k#n
1   cans  k#n}
2  canse  k#(z
3  canpe   NaN
4   canp  k#()
5   camp   NaN
The expected result is given below. I computed the UP for the two columns word and code:
    word  code  wordUP  codeUP
0    can   k#n       4       4   # 'can' can be discriminated from 'cans' at the imagined fourth letter, which does not exist
1   cans  k#n}       5       4
2  canse  k#(z       5       4
3  canpe             5           # empty cells don't have a UP
4   canp  k#()       5       4
5   camp             3
My current implementation works, but is too slow for my 100k row dataframe. You can see it below. Can you come up with something faster?
def slice_prefix(a, b, start=0, length=1):
    while 1:
        while a[start:start + length] == b[start:start + length]:
            start += length
            length += length
        if length > 1:
            length = 1
        else:
            return start
df = df.fillna('')

# get possible orthographic and phonetic representations
all_words = df['word'].dropna().to_list()
all_codes = df['code'].dropna().to_list()

# prepare new columns
df['wordUP'] = np.nan
df['codeUP'] = np.nan

# compute UP
for idx, row in df.iterrows():
    word = row['word']
    code = row['code']
    wordUP = max([slice_prefix(word, item) for item in all_words if item != word]) + 1
    codeUP = max([slice_prefix(code, item) for item in all_codes if item != code]) + 1
    df.loc[idx, 'wordUP'] = wordUP
    df.loc[idx, 'codeUP'] = codeUP

df.loc[df['code'] == '', 'codeUP'] = 0
As it is, your code runs in 0.0012 seconds on average (10,000 iterations) on my computer.
I suggest you refactor your code in a way that is both faster (0.0008 seconds, -33%) and more idiomatic, thus more readable:
import numpy as np
import pandas as pd

def slice_prefix(a, b, start=0, length=1):
    while 1:
        while a[start:start + length] == b[start:start + length]:
            start += length
            length += length
        if length > 1:
            length = 1
        else:
            return start

def find_up(x, strings):
    """New helper function"""
    return int(max([slice_prefix(x, item) for item in strings if item != x])) + 1

df = df.fillna(" ")

df = (
    df
    .assign(wordUP=df["word"].apply(lambda x: find_up(x, df["word"])))
    .assign(codeUP=df["code"].apply(lambda x: find_up(x, df["code"])))
    .replace(1, "")
)

print(df)
# Output
    word  code wordUP codeUP
0    can   k#n      4      4
1   cans  k#n}      5      4
2  canse  k#(z      5      4
3  canpe            5
4   canp  k#()      5      4
5   camp            3
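
Both versions are still O(n^2) in the number of strings, which may matter at 100k rows. If that becomes the bottleneck, one possible direction (a sketch, not part of the answer above, and assuming the strings are distinct) uses the fact that, after sorting, the longest common prefix of a string with any other string in the list is attained at one of its two lexicographic neighbours:

def lcp(a, b):
    # length of the longest common prefix of a and b
    n = min(len(a), len(b))
    for i in range(n):
        if a[i] != b[i]:
            return i
    return n

def uniqueness_points(strings):
    # UP of s = 1 + max LCP of s with any other string; after sorting,
    # that maximum occurs at a lexicographic neighbour, so one sorted
    # pass suffices: O(n log n) instead of O(n^2)
    order = sorted(range(len(strings)), key=lambda i: strings[i])
    ups = [0] * len(strings)
    for pos, i in enumerate(order):
        best = 0
        if pos > 0:
            best = max(best, lcp(strings[i], strings[order[pos - 1]]))
        if pos + 1 < len(order):
            best = max(best, lcp(strings[i], strings[order[pos + 1]]))
        ups[i] = best + 1
    return ups

words = ['can', 'cans', 'canse', 'canpe', 'canp', 'camp']
print(uniqueness_points(words))  # [4, 5, 5, 5, 5, 3]

Duplicates (such as the two empty codes produced by fillna('')) would need to be excluded first, just as the item != x filter does in the code above.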

How to put the print information into a current dataframe

I'm having a brain fart. I wrote some code to get keywords from my data frame. It worked, but how can I put the printed information into my current data frame? Thank you for the help in advance.
from scipy.sparse import coo_matrix

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    # use only topn items from vector
    sorted_items = sorted_items[:topn]
    score_vals = []
    feature_vals = []
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        # keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
    # create a tuple of (feature, score)
    # results = zip(feature_vals, score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    return results
# sort the tf-idf vectors by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())

# extract only the top n; n here is 5
keywords = extract_topn_from_vector(feature_names, sorted_items, 5)

# now print the results - NEED TO PUT THIS INFORMATION IN MY CURRENT DATAFRAME
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for k in keywords:
    print(k, keywords[k])
First: a DataFrame is not Excel, so it may not look the way you expect.
You can use append() to add a new row with text. It automatically adds NaN if the row is shorter, or adds new columns with NaN if the row is longer.
import pandas as pd

data = {
    'X': ['A', 'B', 'C'],
    'Y': ['D', 'E', 'F'],
    'Z': ['G', 'H', 'I']
}
df = pd.DataFrame(data)
print(df)

df = df.append({"X": 'Abstract:'}, ignore_index=True)
df = df.append({"X": 'Keywords:'}, ignore_index=True)

keywords = {"first": 123, "second": 456, "third": 789}
for key, value in keywords.items():
    df = df.append({"X": key, "Y": value}, ignore_index=True)

print(df)
Result:
# Before
   X  Y  Z
0  A  D  G
1  B  E  H
2  C  F  I

# After
           X    Y    Z
0          A    D    G
1          B    E    H
2          C    F    I
3  Abstract:  NaN  NaN
4  Keywords:  NaN  NaN
5      first  123  NaN
6     second  456  NaN
7      third  789  NaN
Later you can replace NaN with something else - e.g. an empty string:
df = df.fillna('')
Result:
           X    Y  Z
0          A    D  G
1          B    E  H
2          C    F  I
3  Abstract:
4  Keywords:
5      first  123
6     second  456
7      third  789
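
Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on recent versions the same rows have to be added with pd.concat instead. A minimal sketch of the equivalent:

import pandas as pd

df = pd.DataFrame({'X': ['A', 'B', 'C'], 'Y': ['D', 'E', 'F'], 'Z': ['G', 'H', 'I']})
keywords = {"first": 123, "second": 456, "third": 789}

# build all extra rows first, then concatenate them in a single call
extra = pd.DataFrame(
    [{"X": "Abstract:"}, {"X": "Keywords:"}]
    + [{"X": k, "Y": v} for k, v in keywords.items()]
)
df = pd.concat([df, extra], ignore_index=True)
print(df.fillna(''))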

How can I fill NaN values in a dataframe with the average of the values above it?

I'm looking to make it so that NaN values in a dataframe are filled in by the mean of all the values up to that point, as such:
A
0 1
1 2
2 3
3 4
4 5
5 NaN
6 NaN
7 11
8 NaN
Would become
A
0 1
1 2
2 3
3 4
4 5
5 3
6 3
7 11
8 4
You can solve it by running the following code
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "A": [1, 2, 3, 4, 5, pd.NA, pd.NA, 11, pd.NA]
})

for idx in df[pd.isna(df["A"])].index:
    df.loc[idx, "A"] = np.mean(df.loc[:idx, "A"])
It iterates over each NaN and fills it with the mean of the previous values, including previously filled NaNs.
At the end you will have:
>>> df
A
0 1
1 2
2 3
3 4
4 5
5 3
6 3
7 11
8 4
EDIT
As stated by RichieV, performance may be an issue with this solution (its runtime complexity is O(N^2)) when there are many NaNs; we should also avoid Python-level iteration, since it is slow compared to native pandas / numpy calls.
Here is an optimized version:
last_idx = None
cumsum = 0
cumnum = 0

for idx in df[pd.isna(df["A"])].index:
    prev_values = df.loc[last_idx:idx, "A"]
    # label-based .loc slicing includes both endpoints, so drop idx itself
    prev_values = prev_values[:-1]
    cumsum += prev_values.sum()
    cumnum += len(prev_values)
    df.loc[idx, "A"] = int(cumsum / cumnum)
    last_idx = idx
Result:
>>> df
A
0 1
1 2
2 3
3 4
4 5
5 3
6 3
7 11
8 4
Since in the worst case the script passes over the dataframe twice, the runtime complexity is now O(N).
Marco's answer works fine, but it can be optimized with incremental average formulas from math.stackexchange.com.
Here is an adaptation of that other question (not the exact formula, just the concept).
cumsum = 0
expanding_mean = []
for i, xi in enumerate(df['A']):
    if pd.isna(xi):
        mean = cumsum / i  # divide by number of items up to the previous row
        expanding_mean.append(mean)
        cumsum += mean
    else:
        cumsum += xi

df.loc[df['A'].isna(), 'A'] = expanding_mean
The main advantage of this code is not having to re-read all items up to the current index on each iteration to get the mean. It works because appending a value equal to the current running mean leaves the mean unchanged, so adding the just-computed mean back into cumsum keeps later fills consistent with the original definition.
This option still uses a python loop--which is not the best choice with pandas--but there seems to be no way around it for this use case (hopefully someone will get inspired by this and find such a method without a loop).
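As a quick sanity check of that invariant (my addition, not part of the answers themselves), here is a one-pass sketch on the sample data, assuming the first value is not NaN:

import numpy as np
import pandas as pd

s = pd.Series([1, 2, 3, 4, 5, np.nan, np.nan, 11, np.nan], dtype=float)

cumsum, filled = 0.0, []
for count, x in enumerate(s):
    if np.isnan(x):
        x = cumsum / count  # the running mean; appending it leaves the mean unchanged
    filled.append(x)
    cumsum += x

print([int(x) for x in filled])  # [1, 2, 3, 4, 5, 3, 3, 11, 4]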
Performance tests
Three alternative functions were defined:
incremental: My answer.
from_origin: Marco's original answer.
incremental_pandas: Marco's updated answer.
Tests were done using the timeit module with 3 repetitions on random samples with a 0.4 probability of NaN.
Full code for testing
import pandas as pd
import numpy as np
import timeit
import collections
from matplotlib import pyplot as plt

def incremental(df: pd.DataFrame):
    # error handling
    if pd.isna(df.iloc[0, 0]):
        df.iloc[0, 0] = 0
    cumsum = 0
    expanding_mean = []
    for i, xi in enumerate(df['A']):
        if pd.isna(xi):
            mean = cumsum / i  # divide by number of items up to the previous row
            expanding_mean.append(mean)
            cumsum += mean
        else:
            cumsum += xi
    df.loc[df['A'].isna(), 'A'] = expanding_mean
    return df

def incremental_pandas(df: pd.DataFrame):
    # error handling
    if pd.isna(df.iloc[0, 0]):
        df.iloc[0, 0] = 0
    last_idx = None
    cumsum = 0
    cumnum = 0
    for idx in df[pd.isna(df["A"])].index:
        prev_values = df.loc[last_idx:idx, "A"]
        # label-based .loc slicing includes both endpoints, so drop idx itself
        prev_values = prev_values[:-1]
        cumsum += prev_values.sum()
        cumnum += len(prev_values)
        df.loc[idx, "A"] = cumsum / cumnum
        last_idx = idx
    return df

def from_origin(df: pd.DataFrame):
    # error handling
    if pd.isna(df.iloc[0, 0]):
        df.iloc[0, 0] = 0
    for idx in df[pd.isna(df["A"])].index:
        df.loc[idx, "A"] = np.mean(df.loc[:idx, "A"])
    return df

def get_random_sample(n, p):
    np.random.seed(123)
    return pd.DataFrame({'A':
        np.random.choice(list(range(10)) + [np.nan],
                         size=n, p=[(1 - p) / 10] * 10 + [p])})

r = 3
p = 0.4  # portion of NaNs

# check result from all functions
results = []
for func in [from_origin, incremental, incremental_pandas]:
    random_df = get_random_sample(1000, p)
    new_df = random_df.copy(deep=True)
    results.append(func(new_df))
print('Passed' if all(np.allclose(r, results[0]) for r in results[1:])
      else 'Failed', 'implementation test')

timings = {}
for n in np.geomspace(10, 10000, 10):
    random_df = get_random_sample(int(n), p)
    timings[n] = collections.defaultdict(float)
    results = {}
    for func in ['incremental', 'from_origin', 'incremental_pandas']:
        timings[n][func] = (
            timeit.timeit(f'{func}(random_df.copy(deep=True))', number=r, globals=globals())
            / r
        )

timings = pd.DataFrame(timings).T
print(timings)

timings.plot()
plt.xlabel('size of array')
plt.ylabel('avg runtime (s)')
plt.ylim(0)
plt.grid(True)
plt.tight_layout()
plt.show()
plt.close('all')

Pandas - How to iterate over groupby to count number of occurences

I have a DF like the one below. I want to group by price and count the number of occurrences where Action == N / U / D for each price:
ID,Action,indicator,side, price, quantity
7930249,U,0,A,132.938,23
7930251,D,0,B,132.906,2
7930251,N,1,B,132.891,36
7930251,U,0,A,132.938,22
7930252,U,0,A,132.938,2
7930252,U,1,A,132.953,39
7930252,U,2,A,132.969,17
7930514,U,0,B,132.906,1
7930514,U,0,A,132.938,8
7930514,U,1,A,132.953,38
7930515,U,0,A,132.938,18
7930515,U,2,A,132.969,7
7930516,U,1,B,132.891,37
7930516,U,0,A,132.938,28
Current code:
pricelist = []
column_names = ['Price', 'N', 'U', 'D']
df_counter = pd.DataFrame(columns=column_names)

for name, group in df.groupby('Price'):
    price = name
    if price not in pricelist:
        pricelist.append(price)
        n_count = group['Action'][group['Action'] == 'N'].count()
        u_count = group['Action'][group['Action'] == 'U'].count()
        d_count = group['Action'][group['Action'] == 'D'].count()
        dflist = [price, n_count, u_count, d_count]
        price_dict = {'Price': price, 'N': n_count, 'U': u_count, 'D': d_count}
        df1 = pd.DataFrame([price_dict], columns=price_dict.keys())
        result = df_counter.append(df1)
        continue
    else:
        continue
Returning:
Price N U D
0 136.938 1 0 0
Why is it not creating a longer data frame? I basically have the result when I print out price_dict, but I am struggling to save it to a DataFrame.
IIUC, try using pd.crosstab instead of coding your own method (in your loop, df_counter.append returns a new frame rather than modifying df_counter in place, and you overwrite result with it on every iteration, so result only ever holds the last group's row):
pd.crosstab(df['price'], df['Action'])
Output:
Action D N U
price
132.891 0 1 1
132.906 1 0 1
132.938 0 0 6
132.953 0 0 2
132.969 0 0 2
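If you would rather stay with groupby (for example to combine this with other per-price aggregations), an equivalent formulation that should produce the same table is:

out = (df.groupby('price')['Action']
         .value_counts()
         .unstack(fill_value=0))
print(out)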

Python: How to change same numbers in a Series/Column to other values?

I am trying to change the values of a very long column (about 1 million entries) in a data frame. I have something like
####ID_Orig
3452
3452
3452
6543
6543
...
I want something like
####ID_new
0
0
0
1
1
...
At the moment I'm doing this:
j = 0
for i in range(0, 1199531):
    if data.ID_orig[i] == data.ID_orig[i+1]:
        data.ID_orig[i] = j
    else:
        data.ID_orig[i] = j
        j = j + 1
This takes ages... Is there a faster way to do it?
I don't know what values ID_orig has or how often a single value comes up.
Use factorize, but if there are duplicated groups, the output values are set to the same number.
Another solution, comparing shifted values with ne (!=) and taking the cumsum, is more general - it always creates new values, even when group values repeat:
df['ID_new1'] = pd.factorize(df['ID_Orig'])[0]
df['ID_new2'] = df['ID_Orig'].ne(df['ID_Orig'].shift()).cumsum() - 1
print (df)
   ID_Orig  ID_new1  ID_new2
0     3452        0        0
1     3452        0        0
2     3452        0        0
3     6543        1        1
4     6543        1        1
5      100        2        2
6      100        2        2
7     6543        1        3   <- repeating group
8     6543        1        3   <- repeating group
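For completeness, groupby(...).ngroup() with sort=False should give the same appearance-ordered labels as factorize:

df['ID_new3'] = df.groupby('ID_Orig', sort=False).ngroup()  # matches ID_new1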
You can do this (note that it assumes equal IDs are contiguous, as in the example):
import collections

l1 = [3452, 3452, 3452, 6543, 6543]
c = collections.Counter(l1)
l2 = list(c.items())
l3 = []

for i, t in enumerate(l2):
    for x in range(t[1]):
        l3.append(i)

for x in l3:
    print(x)
This is the output:
0
0
0
1
1
You can use the following. In this implementation, duplicate ids in the original column get the same new id. It works by dropping duplicates from the column, assigning a different number to each unique id to form the new ids, and then merging these new ids back into the original dataset.
import numpy as np
import pandas as pd
from time import time

num_rows = 119953
input_data = np.random.randint(1199531, size=(num_rows, 1))

data = pd.DataFrame(input_data)
data.columns = ["ID_orig"]
data2 = pd.DataFrame(input_data)
data2.columns = ["ID_orig"]

t0 = time()
j = 0
for i in range(0, num_rows-1):
    if data.ID_orig[i] == data.ID_orig[i+1]:
        data.ID_orig[i] = j
    else:
        data.ID_orig[i] = j
        j = j + 1
t1 = time()

id_new = data2.loc[:, "ID_orig"].drop_duplicates().reset_index().drop("index", axis=1)
id_new.reset_index(inplace=True)
id_new.columns = ["id_new"] + id_new.columns[1:].values.tolist()
data2 = data2.merge(id_new, on="ID_orig")
t2 = time()

print("Previous: ", round(t1-t0, 2), " seconds")
print("Current : ", round(t2-t1, 2), " seconds")
The output of the above program using only 119k rows is
Previous: 12.16 seconds
Current : 0.06 seconds
The runtime difference increases even more as the number of rows is increased.
EDIT
Using the same number of rows:
>>> print("Previous: ", round(t1-t0, 2))
Previous: 11.7
>>> print("Current : ", round(t2-t1, 2))
Current : 0.06
>>> print("jezrael's answer : ", round(t3-t2, 2))
jezrael's answer : 0.02
