I'm stuck on trying to add values to the numbers column.
import pandas as pd

def twod_array(num):
    data = {"group": [-1, 0, 1, 2],
            'numbers': [[2], [14, 15], [16, 17], [19, 20, 21]],
            }
    df = pd.DataFrame(data=data)
    print(df)
    return 0
Currently it prints this:
group numbers
0 -1 [2]
1 0 [14, 15]
2 1 [16, 17]
3 2 [19, 20, 21]
What I'd like to do is to add a value based on the passed input, so for example if I pass 14.5 as a num, this is the output I'd like to see:
group numbers
0 -1 [2]
1 0 [14, 14.5, 15]
2 1 [16, 17]
3 2 [19, 20, 21]
I'm hoping someone can help with this. This is what I have so far, but it fails at the insert line with the error: 'numpy.ndarray' object has no attribute 'insert'.
df = pd.DataFrame({"group": [-1, 0, 1, 2],
                   'numbers': [[2], [14, 15], [16, 17], [19, 20, 21]],
                   })
arr = df['numbers'].to_list()
num = 14.5
for i, sub_arr in enumerate(arr):
    for j, n in enumerate(sub_arr):
        if arr[i][j] > num:
            if j != 0:
                arr[i].insert(j, num)
            else:
                arr[i-1].insert(-1, num)
df['numbers'] = arr
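One possible cause of that error (an assumption on my part, since the snippet above builds the column from plain Python lists): if the numbers column actually holds NumPy arrays, they have no .insert method, and converting each entry back to a plain list first avoids it.
# hypothetical fix, only needed if the column entries are numpy arrays rather than lists
df['numbers'] = df['numbers'].apply(list)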
num = 14.5
mask = (df.numbers.apply(min).lt(num) &
        df.numbers.apply(max).gt(num))
index = mask[mask].index[0]
df.numbers.at[index].append(num)
df.numbers.at[index].sort()
print(df)
# Output:
group numbers
0 -1 [2]
1 0 [14, 14.5, 15]
2 1 [16, 17]
3 2 [19, 20, 21]
Iterate through the df and check whether num lies between the first and last value of the numbers list in each row. If it does, use the bisect module to insert num in a sorted fashion.
import bisect

for i in range(len(df)):
    if num >= df.loc[i, 'numbers'][0] and num <= df.loc[i, 'numbers'][-1]:
        bisect.insort(df.loc[i, 'numbers'], num)
print(df)
group numbers
0 -1 [2]
1 0 [14, 14.5, 15]
2 1 [16, 17]
3 2 [19, 20, 21]
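For completeness, here is a self-contained sketch of this approach, using the same sample DataFrame and num as above:
import bisect
import pandas as pd

df = pd.DataFrame({"group": [-1, 0, 1, 2],
                   'numbers': [[2], [14, 15], [16, 17], [19, 20, 21]]})
num = 14.5

for i in range(len(df)):
    sub = df.loc[i, 'numbers']       # the list stored in this row
    if sub[0] <= num <= sub[-1]:     # num falls within this row's range
        bisect.insort(sub, num)      # insert while keeping the list sorted

print(df)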
Related
I have two CSVs: one is the Master-Data and the other the Component-Data. Master-Data has two rows and two columns, whereas Component-Data has five rows and two columns.
I'm trying to find the cosine similarity between each pair of sentences after tokenization, stemming and lemmatization, and then append the similarity scores as new columns. I'm unable to append the corresponding values to the dataframe, which then needs to be converted to CSV.
My Approach:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter
import pandas as pd

portStemmer = PorterStemmer()
wordNetLemmatizer = WordNetLemmatizer()
fields = ['Sentences']
cosineSimilarityList = []

def fetchLemmantizedWords():
    eliminatePunctuation = re.sub('[^a-zA-Z]', ' ', value)
    convertLowerCase = eliminatePunctuation.lower()
    tokenizeData = convertLowerCase.split()
    eliminateStopWords = [word for word in tokenizeData if not word in set(stopwords.words('english'))]
    stemWords = list(set([portStemmer.stem(value) for value in eliminateStopWords]))
    wordLemmatization = [wordNetLemmatizer.lemmatize(x) for x in stemWords]
    return wordLemmatization

def fetchCosine(eachMasterData, eachComponentData):
    masterDataValues = Counter(eachMasterData)
    componentDataValues = Counter(eachComponentData)
    bagOfWords = list(masterDataValues.keys() | componentDataValues.keys())
    masterDataVector = [masterDataValues.get(bagOfWords, 0) for bagOfWords in bagOfWords]
    componentDataVector = [componentDataValues.get(bagOfWords, 0) for bagOfWords in bagOfWords]
    masterDataLength = sum(contractElement*contractElement for contractElement in masterDataVector) ** 0.5
    componentDataLength = sum(questionElement*questionElement for questionElement in componentDataVector) ** 0.5
    dotProduct = sum(contractElement*questionElement for contractElement, questionElement in zip(masterDataVector, componentDataVector))
    cosine = int((dotProduct / (masterDataLength * componentDataLength)) * 100)
    return cosine

masterData = pd.read_csv('C:\\Similarity\\MasterData.csv', skipinitialspace=True)
componentData = pd.read_csv('C:\\Similarity\\ComponentData.csv', skipinitialspace=True)

for value in masterData['Sentences']:
    eachMasterData = fetchLemmantizedWords()
    for value in componentData['Sentences']:
        eachComponentData = fetchLemmantizedWords()
        cosineSimilarity = fetchCosine(eachMasterData, eachComponentData)
        cosineSimilarityList.append(cosineSimilarity)

for value in cosineSimilarityList:
    componentData = componentData.append(pd.DataFrame(cosineSimilarityList, columns=['Cosine Similarity']), ignore_index=True)
    # componentData['Cosine Similarity'] = value
Expected output after converting the df to CSV:
I'm facing issues while appending the values to the dataframe. Please assist me with an approach for this. Thanks.
Here's what I came up with:
Sample setup
csv_master_data = \
"""
SI.No;Sentences
1;Emma is writing a letter.
2;We wake up early in the morning.
"""
csv_component_data = \
"""
SI.No;Sentences
1;Emma is writing a letter.
2;We wake up early in the morning.
3;Did Emma Write a letter?
4;We sleep early at night.
5;Emma wrote a letter.
"""
import pandas as pd
from io import StringIO
df_md = pd.read_csv(StringIO(csv_master_data), delimiter=';')
df_cd = pd.read_csv(StringIO(csv_component_data), delimiter=';')
We end up with 2 dataframes (showing df_cd):
   SI.No                         Sentences
0      1         Emma is writing a letter.
1      2  We wake up early in the morning.
2      3          Did Emma Write a letter?
3      4          We sleep early at night.
4      5              Emma wrote a letter.
I replaced the 2 functions you used with the following dummy functions:
import random

def fetchLemmantizedWords(words):
    return [random.randint(1, 30) for x in words]

def fetchCosine(lem_md, lem_cd):
    return 100 if len(lem_md) == len(lem_cd) else random.randint(0, 100)
Processing data
First, we apply the fetchLemmantizedWords function to each dataframe. The lowercasing, regex clean-up and split of each sentence are done while preparing the function's input, instead of inside the function itself.
By making the sentence lowercase first, we can simplify the regex to only consider lowercase letters.
import re

for df in (df_md, df_cd):
    df['lem'] = df.apply(lambda x: fetchLemmantizedWords(
                             # re.sub does the regex replace (plain str.replace would be literal)
                             re.sub(r'[^a-z]', ' ', x.Sentences.lower()).split()),
                         result_type='reduce',
                         axis=1)
Result for df_cd:
   SI.No                         Sentences                        lem
0      1         Emma is writing a letter.          [29, 5, 4, 9, 28]
1      2  We wake up early in the morning.  [16, 8, 21, 14, 13, 4, 6]
2      3          Did Emma Write a letter?         [30, 9, 23, 16, 5]
3      4          We sleep early at night.          [8, 25, 24, 7, 3]
4      5              Emma wrote a letter.            [30, 30, 15, 7]
Next, we use a cross-join to make a dataframe with all possible combinations of md and cd data.
df_merged = pd.merge(df_md[['SI.No', 'lem']],
df_cd[['SI.No', 'lem']],
how='cross',
suffixes=('_md','_cd')
)
df_merged contents:
   SI.No_md                      lem_md  SI.No_cd                      lem_cd
0         1          [14, 22, 9, 21, 4]         1            [3, 4, 8, 17, 2]
1         1          [14, 22, 9, 21, 4]         2  [29, 3, 10, 2, 19, 18, 21]
2         1          [14, 22, 9, 21, 4]         3          [20, 22, 29, 4, 3]
3         1          [14, 22, 9, 21, 4]         4          [17, 7, 1, 27, 19]
4         1          [14, 22, 9, 21, 4]         5              [17, 5, 3, 29]
5         2  [12, 30, 10, 11, 7, 11, 8]         1            [3, 4, 8, 17, 2]
6         2  [12, 30, 10, 11, 7, 11, 8]         2  [29, 3, 10, 2, 19, 18, 21]
7         2  [12, 30, 10, 11, 7, 11, 8]         3          [20, 22, 29, 4, 3]
8         2  [12, 30, 10, 11, 7, 11, 8]         4          [17, 7, 1, 27, 19]
9         2  [12, 30, 10, 11, 7, 11, 8]         5              [17, 5, 3, 29]
Next, we calculate the cosine value:
df_merged['cosine'] = df_merged.apply(lambda x: fetchCosine(x.lem_md,
                                                            x.lem_cd),
                                      axis=1)
In the last step, we pivot the data and merge the original df_cd with the calculated results:
pd.merge(df_cd.drop(columns='lem').set_index('SI.No'),
         df_merged.pivot_table(index='SI.No_cd',
                               columns='SI.No_md').droplevel(0, axis=1),
         how='inner',
         left_index=True,
         right_index=True)
Result (again, these are dummy calculations):
SI.No                         Sentences    1    2
1             Emma is writing a letter.  100   64
2      We wake up early in the morning.   63  100
3               Did Emma Write a letter?  100    5
4               We sleep early at night.  100   17
5                   Emma wrote a letter.   35    9
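Finally, since the question ultimately wants the result written back out as a CSV, the merged dataframe can be saved with to_csv (the file name below is just a placeholder):
result = pd.merge(df_cd.drop(columns='lem').set_index('SI.No'),
                  df_merged.pivot_table(index='SI.No_cd',
                                        columns='SI.No_md').droplevel(0, axis=1),
                  how='inner', left_index=True, right_index=True)
result.to_csv('ComponentDataWithSimilarity.csv')  # placeholder file name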
Given two lists with the same length:
a = [15, 10, 15, 14]
b = [14, 15, 14, 14]
What is the minimal number of changes needed for 'a' to become 'b'?
This is my attempt at it:
c3 = 0
fht2 = [15, 10, 15, 14]
sht2 = [14, 15, 14, 14]
for i in range(0, len(fht2)):
    for j in range(0, len(sht2)):
        if fht2 == sht2:
            c = 0
        else:
            if fht2 != sht2:
                if fht2[i] != sht2[j] and fht2[i] <= sht2[j]:
                    fht2[i] += abs(fht2[i] - sht2[j])
                    c3 += 1
                if fht2[i] != sht2[j] and fht2[i] >= sht2[j]:
                    fht2[i] -= abs(fht2[i] - sht2[j])
                    c3 += 1
print(fht2)
print(c3)
The output should be:
[14, 15, 14, 14]
3
However, my output is:
[14, 14, 14, 14]
11
Please assist.
a = [15, 10, 15, 14]
b = [14, 15, 14, 14]
list_length = len(a)
print("list length:", list_length)
counter = 0
for i in range(list_length):
    if a[i] != b[i]:
        counter = counter + 1
print("minimal number of changes is:", counter)
Output:
list length: 4
minimal number of changes is: 3
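The same count can also be written as a one-liner over the zipped pairs:
a = [15, 10, 15, 14]
b = [14, 15, 14, 14]
print(sum(x != y for x, y in zip(a, b)))  # 3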
Please learn to write a better question
I have an array in which I want to sum specific elements while iterating through it. I'm struggling to find a way to do this with a loop.
The array shape is (25, 25).
array
[ 92843, 86851, 91950, 98232, 83329, 94591, 88962, 97020,
107113, 98452, 103242, 106442, 123032, 119063, 112971, 114715,
108654, 114856, 109872, 124583, 120518, 112815, 120780, 127831,
147174],
[132633, 124073, 131357, 140331, 119041, 135131, 127089, 138601,
153019, 140647, 147489, 152061, 175761, 170090, 161388, 163879,
155221, 164080, 156960, 177976, 172169, 161165, 172544, 182617,
210249],
[159159, 148887, 157629, 168397, 142849, 162157, 152507, 166321,
183623, 168776, 176986, 182473, 210913, 204108, 193665, 196655,
186265, 196896, 188352, 213571, 206602, 193398, 207052, 219140,
252298]
I want to print out results like below for each iteration
print(array[23][0]+array[23][1]) # 159159 + 148887 = 308046
print(array[22][0]+array[22][1]+array[22][2]) #132633 + 124073 + 131357 = 388063
print(array[21][0]+array[21][1]+array[21][2]+array[21][3]) # 92843 + 86851 + 91950 + 98232 = 369876
Presenting each element as array[i][j]: as you can see, with each iteration i decreases by 1 and the range of j grows by one.
Is there any way I can use a loop to do this task? Thanks!
Try this:
for i, sub in enumerate(reversed(array)):
    print(sum(sub[:i]))
For example, if
array = [[ 1, 2, 3, 4, 5],
[ 6, 7, 8, 9, 10],
[11, 12, 13, 14, 15],
[16, 17, 18, 19, 20],
[21, 22, 23, 24, 25]]
the output would be
0 # last row, no elements summed
16 # 16 = 16
23 # 11 + 12 = 23
21 # 6 + 7 + 8 = 21
10 # 1 + 2 + 3 + 4 = 10
You may simply want np.tril, followed by np.sum(_, axis=1). This will give the sum of each row of the lower triangle of the matrix (axis=0 would give the column sums instead). Easily altered to use the upper triangle via np.triu, if that's what you need.
print(np.sum(np.tril(array), axis=1))
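As a quick sanity check, here is a sketch assuming numpy is imported as np, using the same 5x5 sample array as the previous answer:
import numpy as np

array = np.arange(1, 26).reshape(5, 5)   # the 5x5 sample from above
print(np.sum(np.tril(array), axis=1))    # [  1  13  36  70 115]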
In [661]: arr = np.arange(1,17).reshape(4,4)
In [662]: arr
Out[662]:
array([[ 1, 2, 3, 4],
[ 5, 6, 7, 8],
[ 9, 10, 11, 12],
[13, 14, 15, 16]])
In [666]: for i in range(3,-1,-1):
...: c = arr[i,:4-i]
...: print(c.sum(), c)
...:
13 [13]
19 [ 9 10]
18 [5 6 7]
10 [1 2 3 4]
How do I count the number of times the same integer occurs?
My code so far:
def searchAlgorithm(target, array):
    i = 0  # iterating through elements of target list
    q = 0  # iterating through lists sublists via indexes
    while q < 4:
        x = 0  # counting number of matches
        for i in target:
            if i in array[q]:
                x += 1
            else:
                x == 0
        print(x)
        q += 1

a = [8, 12, 14, 26, 27, 28]
b = [[4, 12, 17, 26, 30, 45], [8, 12, 19, 24, 33, 47], [3, 10, 14, 31, 39, 41], [4, 12, 14, 26, 30, 45]]
searchAlgorithm(a, b)
The output of this is:
2
2
1
3
What I want to achieve is counting how many times 1, 2 or 3 matches occur.
I have tried:
v = 0
if searchAlgorithm(a, b) == 2:
    v += 1
print(v)
But that results in 0
You can use intersection of sets to find elements that are common in both lists. Then you can get the length of the sets. Here is how it looks:
num_common_elements = (len(set(a).intersection(i)) for i in b)
You can then iterate over the generator num_common_elements to use the values. Or you can cast it to a list to see the results:
print(list(num_common_elements))
[Out]: [2, 2, 1, 3]
If you want to implement the intersection functionality yourself, you can use the built-in sum function to implement your own version. This is equivalent to doing len(set(x).intersection(set(y))).
sum(i in y for i in x)
This works because it generates values such as [True, False, False, True, True], representing where the values in the first list are present in the second list. sum then treats the Trues as 1s and the Falses as 0s, giving you the size of the intersection.
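For instance, applied to the lists from the question, this reproduces the same counts:
a = [8, 12, 14, 26, 27, 28]
b = [[4, 12, 17, 26, 30, 45], [8, 12, 19, 24, 33, 47], [3, 10, 14, 31, 39, 41], [4, 12, 14, 26, 30, 45]]

print([sum(i in y for i in a) for y in b])  # [2, 2, 1, 3]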
This is based on what I understand from your question. Probably you are looking for this:
from collections import Counter
def searchAlgorithm(target, array):
    i = 0  # iterating through elements of target list
    q = 0  # iterating through lists sublists via indexes
    lst = []
    while q < 4:
        x = 0  # counting number of matches
        for i in target:
            if i in array[q]:
                x += 1
            else:
                x == 0
        lst.append(x)
        q += 1
    print(Counter(lst))

a = [8, 12, 14, 26, 27, 28]
b = [[4, 12, 17, 26, 30, 45], [8, 12, 19, 24, 33, 47], [3, 10, 14, 31, 39, 41], [4, 12, 14, 26, 30, 45]]
searchAlgorithm(a, b)
# Counter({2: 2, 1: 1, 3: 1})
Thanks to everyone for the helpful feedback; I have since come up with a simpler solution that does exactly what I want.
By storing the results of the matches in a list, I can return that list from the searchAlgorithm function and simply use .count() to count all the matches of a specific number within the list.
def searchAlgorithm(target, array):
    i = 0
    q = 0
    results = []
    while q < 4:
        x = 0  # counting number of matches
        for i in target:
            if i in array[q]:
                x += 1
            else:
                x == 0
        results.append(x)
        q += 1
    return results

a = [8, 12, 14, 26, 27, 28]
b = [[4, 12, 17, 26, 30, 45], [8, 12, 19, 24, 33, 47], [3, 10, 14, 31, 39, 41], [4, 12, 14, 26, 30, 45]]
searchAlgorithm(a, b)
d2 = (searchAlgorithm(winNum, lotto).count(2))
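For example, with the sample a and b above (winNum and lotto in the last line are presumably the real inputs), counting the sub-lists with exactly two or exactly three matches looks like this:
results = searchAlgorithm(a, b)  # [2, 2, 1, 3]
print(results.count(2))          # 2
print(results.count(3))          # 1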
I have the dataframe df and list members like this:
df = pd.DataFrame({'a':[[20,21],[22,19],[30,27]], 'b':[22,13,7]})
members = [[20,21],[18,21],[15,18]]
I want to select the subset df1 from df where the value of column 'a' is in the list members.
In the given case, I want to get the output like this:
df1 = pd.DataFrame({'a':[[20,21]], 'b':[22]})
Use isin
In [523]: df[df['a'].isin(members)]
Out[523]:
a b
0 [20, 21] 22
Or, query
In [530]: df.query('a in #members')
Out[530]:
a b
0 [20, 21] 22
Or, apply in
In [524]: df[df['a'].apply(lambda x: x in members)]
Out[524]:
a b
0 [20, 21] 22
Or, list comprehension
In [536]: df[[x in members for x in df['a']]]
Out[536]:
a b
0 [20, 21] 22
Details
In [525]: df
Out[525]:
a b
0 [20, 21] 22
1 [22, 19] 13
2 [30, 27] 7
In [526]: members
Out[526]: [[20, 21], [18, 21], [15, 18]]
In [527]: pd.__version__
Out[527]: '0.23.0.dev0+60.ge09189e'
One possible way is to convert the values to tuples and filter by boolean indexing with isin:
df = df[df['a'].apply(tuple).isin([tuple(x) for x in members])]
print (df)
a b
0 [20, 21] 22
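Tuples are hashable while lists are not, which is why the conversion makes isin behave reliably here; a quick self-contained check with the sample data:
import pandas as pd

df = pd.DataFrame({'a': [[20, 21], [22, 19], [30, 27]], 'b': [22, 13, 7]})
members = [[20, 21], [18, 21], [15, 18]]

mask = df['a'].apply(tuple).isin([tuple(x) for x in members])
print(df[mask])
#           a   b
# 0  [20, 21]  22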