Explode data in a list not separated by comma - python

Let's say we have the following dataframe (given here as a dict):
{'docdb_family_id': {0: 569328,
1: 574660,
2: 1187498,
3: 1226468,
4: 1236571,
5: 1239098,
6: 1239277,
7: 1239483,
8: 1239622,
9: 1239624,
10: 1239749,
11: 1334477,
12: 1340405,
13: 1340418,
14: 1340462,
15: 1340471,
16: 1340485,
17: 1340488,
18: 1340508,
19: 1340519,
20: 1340541},
'newa_cited_docdb': {0: '[ 596005 4321416 5802640 6031690 6043910 8600475 8642629 9203255 9345445 10177065 10455451 13428248 22139349 22591458 24627241 24750476 26261826 26405611 27079105 27096884]',
1: '[ 5956195 11260528 22181831 22437920 22642946 23278096 23407037 23458128 24244657 24355363 25014714 25115774 25156886 27047688 27089078 27398716]',
2: '[ 5855196 7755392 11183886 22894980 24648618 27185399]',
3: '[ 3573464 6279285 6294985 6542463 6981930 7427770 10325811 14970234 16878329 17935009 21811002 22329817 23543436 23907898 24456108 25283772]',
4: '[ 2777078 2826073 5944733 10484188 11052747 14682645 15688752 22333410 22614097 22646501 22783765 22978728 23231683 24259740 24605606 24839432 25492752 27009992 27044704]',
5: '[ 5777407 10417156 23463145 23845079 24397163 24426379 24916732 25216234 25296619 27054560 27509152]',
6: '[ 4136523 12578497 21994155 22418792 22626616 22655464 22694825 22779403 23081767 23309829 23379411 23621952 24130698 24236071 24267003 24790872 24841797 25343500 27006578]',
7: '[21722194 23841261 23870348 24749080 26713455 26884023 26892256 27123571]',
8: '[ 3770167 9249538 20340153 21805004 21826650 23074051 23211424 23586695 23664858 24139881 24669345 24951262 25109266 25172355 25351735 26158421 27074633]',
9: '[ 3773931 10400885 23825854 24863945 24904226 25372210 26673422 27108903]',
10: '[ 6245732 6270984 6282047 6313094 6323632 6357314 12700997 14934415]',
11: '[1331950 5937719 5950928 6032897 6737094 8103287]',
12: '[22536768 23111794 23827356 24148953 24483064 24636228 26369896 26722884]',
13: '[ 4096597 6452385 9164095 19820980 22468583 23758517 24922228]',
14: '[ 6273193 6365448 9349940 10531948 13589721 20897840 21818345 22422049 23234586 23722349 24282964 24466601 25476838 26223504 26685774 26756449 26812104 26900843 27088150]',
15: '[ 3770297 6285357 21272262 21883292 22392025 23100861 23160290 23827496 24060758 25448672 26918320]',
16: '[21808322 25167492 25401922 26858065]',
17: '[ 6293130 12621423 12977043 14043576 14524083 22013480 23070753 23360636 23672818 24210016 24396413 24505095 25447453 26335550 27560125]',
18: '[21923978 23414619 23700077 23916998 23917011 23917023 24227869]',
19: '[ 3029629 3461742 8589904 10338953 10633369 16254362 22248316 22635394 24392987 25416705 26671842 27391491 27406148]',
20: None},
'paperid': {0: nan,
1: nan,
2: nan,
3: nan,
4: nan,
5: nan,
6: nan,
7: nan,
8: nan,
9: nan,
10: nan,
11: nan,
12: nan,
13: nan,
14: nan,
15: nan,
16: nan,
17: nan,
18: nan,
19: nan,
20: 1998988989.0},
'fronteer': {0: 0,
1: 0,
2: 0,
3: 0,
4: 0,
5: 0,
6: 0,
7: 0,
8: 0,
9: 0,
10: 0,
11: 0,
12: 0,
13: 0,
14: 0,
15: 0,
16: 0,
17: 0,
18: 0,
19: 0,
20: 1},
'distance': {0: nan,
1: nan,
2: nan,
3: nan,
4: nan,
5: nan,
6: nan,
7: nan,
8: nan,
9: nan,
10: nan,
11: nan,
12: nan,
13: nan,
14: nan,
15: nan,
16: nan,
17: nan,
18: nan,
19: nan,
20: 0.0},
'cited_docdb_ls': {0: '[ 596005 4321416 5802640 6031690 6043910 8600475 8642629 9203255 9345445 10177065 10455451 13428248 22139349 22591458 24627241 24750476 26261826 26405611 27079105 27096884]',
1: '[ 5956195 11260528 22181831 22437920 22642946 23278096 23407037 23458128 24244657 24355363 25014714 25115774 25156886 27047688 27089078 27398716]',
2: '[ 5855196 7755392 11183886 22894980 24648618 27185399]',
3: '[ 3573464 6279285 6294985 6542463 6981930 7427770 10325811 14970234 16878329 17935009 21811002 22329817 23543436 23907898 24456108 25283772]',
4: '[ 2777078 2826073 5944733 10484188 11052747 14682645 15688752 22333410 22614097 22646501 22783765 22978728 23231683 24259740 24605606 24839432 25492752 27009992 27044704]',
5: '[ 5777407 10417156 23463145 23845079 24397163 24426379 24916732 25216234 25296619 27054560 27509152]',
6: '[ 4136523 12578497 21994155 22418792 22626616 22655464 22694825 22779403 23081767 23309829 23379411 23621952 24130698 24236071 24267003 24790872 24841797 25343500 27006578]',
7: '[21722194 23841261 23870348 24749080 26713455 26884023 26892256 27123571]',
8: '[ 3770167 9249538 20340153 21805004 21826650 23074051 23211424 23586695 23664858 24139881 24669345 24951262 25109266 25172355 25351735 26158421 27074633]',
9: '[ 3773931 10400885 23825854 24863945 24904226 25372210 26673422 27108903]',
10: '[ 6245732 6270984 6282047 6313094 6323632 6357314 12700997 14934415]',
11: '[1331950 5937719 5950928 6032897 6737094 8103287]',
12: '[22536768 23111794 23827356 24148953 24483064 24636228 26369896 26722884]',
13: '[ 4096597 6452385 9164095 19820980 22468583 23758517 24922228]',
14: '[ 6273193 6365448 9349940 10531948 13589721 20897840 21818345 22422049 23234586 23722349 24282964 24466601 25476838 26223504 26685774 26756449 26812104 26900843 27088150]',
15: '[ 3770297 6285357 21272262 21883292 22392025 23100861 23160290 23827496 24060758 25448672 26918320]',
16: '[21808322 25167492 25401922 26858065]',
17: '[ 6293130 12621423 12977043 14043576 14524083 22013480 23070753 23360636 23672818 24210016 24396413 24505095 25447453 26335550 27560125]',
18: '[21923978 23414619 23700077 23916998 23917011 23917023 24227869]',
19: '[ 3029629 3461742 8589904 10338953 10633369 16254362 22248316 22635394 24392987 25416705 26671842 27391491 27406148]',
20: []}}
What I would like to do is explode the variable cited_docdb_ls, which contains list-like strings whose elements are separated by spaces rather than commas.
How can I do that? If it is not possible, is there a way to separate the elements by commas rather than spaces and then explode them?
The resulting dataframe should either contain cited_docdb_ls with conventional comma-separated lists, or be the exploded dataframe. I have checked the df.explode() documentation but could not find any hint on how to manage situations where the list elements are separated by spaces.
Thank you

I would use str.findall with a \d+ regex to extract the numbers and convert the strings to lists, then explode:
out = (df.assign(newa_cited_docdb=df['newa_cited_docdb'].str.findall(r'\d+'),
                 cited_docdb_ls=df['cited_docdb_ls'].str.findall(r'\d+'))
         .explode(['newa_cited_docdb', 'cited_docdb_ls'])
      )
NB. If you don't have only digits, a \w+ regex will be more generic. However, if the strings can also contain [ or ] anywhere other than as the first and last character (e.g. [abc 12]3 45d]), then @jezrael's answer will be an alternative.
output:
    docdb_family_id newa_cited_docdb       paperid  fronteer  distance  \
0            569328           596005           NaN         0       NaN
0            569328          4321416           NaN         0       NaN
0            569328          5802640           NaN         0       NaN
0            569328          6031690           NaN         0       NaN
0            569328          6043910           NaN         0       NaN
..              ...              ...           ...       ...       ...
19          1340519         25416705           NaN         0       NaN
19          1340519         26671842           NaN         0       NaN
19          1340519         27391491           NaN         0       NaN
19          1340519         27406148           NaN         0       NaN
20          1340541             None  1.998989e+09         1       0.0

   cited_docdb_ls
0          596005
0         4321416
0         5802640
0         6031690
0         6043910
..            ...
19       25416705
19       26671842
19       27391491
19       27406148
20            NaN

[239 rows x 6 columns]
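Note that str.findall returns lists of strings, so the exploded values are strings as well. If you need numbers afterwards, a minimal follow-up sketch (assuming the out frame from above; Int64 is pandas' nullable integer dtype, which tolerates the NaN left in row 20):
out['newa_cited_docdb'] = pd.to_numeric(out['newa_cited_docdb']).astype('Int64')
out['cited_docdb_ls'] = pd.to_numeric(out['cited_docdb_ls']).astype('Int64')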

Use Series.str.strip with Series.str.split for both columns and then DataFrame.explode:
df = (df.assign(newa_cited_docdb=df['newa_cited_docdb'].str.strip('[]').str.split(),
                cited_docdb_ls=df['cited_docdb_ls'].str.strip('[]').str.split())
        .explode(['newa_cited_docdb', 'cited_docdb_ls']))
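For a single value, the combination works like this (a toy Series, just for illustration):
pd.Series(['[ 596005 4321416]']).str.strip('[]').str.split()
# 0    [596005, 4321416]
# dtype: object
str.strip('[]') removes the surrounding brackets and str.split() without arguments splits on any run of whitespace, which matches the format of these strings. Note that, like str.findall, this keeps the values as strings.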

Related

How to get categories of words containing unique 3-letter set from the columns of pandas dataframe in Python?

I have a dataframe df which looks as
         Unnamed: 0           Characters Split    A    B    C    D                  Set Names
0         FROKDUWJU            [FRO, KDU, WJU]  FRO  KDU  WJU  NaN            {WJU, KDU, FRO}
1         IDJWPZSUR            [IDJ, WPZ, SUR]  IDJ  WPZ  SUR  NaN            {SUR, WPZ, IDJ}
2      UCFURKIRODCQ       [UCF, URK, IRO, DCQ]  UCF  URK  IRO  DCQ       {UCF, URK, DCQ, IRO}
3               ORI                      [ORI]  ORI  NaN  NaN  NaN                      {ORI}
4   PROIRKIQARTIBPO  [PRO, IRK, IQA, RTI, BPO]  PRO  IRK  IQA  RTI  {IQA, BPO, PRO, IRK, RTI}
5      QAZWREDCQIBR       [QAZ, WRE, DCQ, IBR]  QAZ  WRE  DCQ  IBR       {DCQ, QAZ, IBR, WRE}
6      PLPRUFSWURKI       [PLP, RUF, SWU, RKI]  PLP  RUF  SWU  RKI       {PLP, SWU, RKI, RUF}
7      FROIEUSKIKIR       [FRO, IEU, SKI, KIR]  FRO  IEU  SKI  KIR       {SKI, IEU, KIR, FRO}
8      ORIUWJZSRFRO       [ORI, UWJ, ZSR, FRO]  ORI  UWJ  ZSR  FRO       {UWJ, ORI, ZSR, FRO}
9         URKIFJVUR            [URK, IFJ, VUR]  URK  IFJ  VUR  NaN            {URK, VUR, IFJ}
10           RUFOFR                 [RUF, OFR]  RUF  OFR  NaN  NaN                 {OFR, RUF}
11              IEU                      [IEU]  IEU  NaN  NaN  NaN                      {IEU}
12           PIMIEU                 [PIM, IEU]  PIM  IEU  NaN  NaN                 {PIM, IEU}
The first column contains certain names. The Characters Split column contains each name split into 3-letter chunks, in the form of a list. Columns A, B, C, and D contain the breakdown of those 3-letter chunks. Column Set Names has the same 3-letter chunks, but in the form of a set.
Some of the 3-letter chunks are shared between different names. For example, "FRO" is present in the names at index 0, 7 and 8. For names that have a 3-letter chunk in common, I'd like to put them into one category, preferably in the form of a list. Is it possible to have these categories for each unique 3-letter chunk? What would be a suitable way to do it?
df.to_dict() is as shown:
{'Unnamed: 0': {0: 'FROKDUWJU',
1: 'IDJWPZSUR',
2: 'UCFURKIRODCQ',
3: 'ORI',
4: 'PROIRKIQARTIBPO',
5: 'QAZWREDCQIBR',
6: 'PLPRUFSWURKI',
7: 'FROIEUSKIKIR',
8: 'ORIUWJZSRFRO',
9: 'URKIFJVUR',
10: 'RUFOFR',
11: 'IEU',
12: 'PIMIEU'},
'Characters Split': {0: ['FRO', 'KDU', 'WJU'],
1: ['IDJ', 'WPZ', 'SUR'],
2: ['UCF', 'URK', 'IRO', 'DCQ'],
3: ['ORI'],
4: ['PRO', 'IRK', 'IQA', 'RTI', 'BPO'],
5: ['QAZ', 'WRE', 'DCQ', 'IBR'],
6: ['PLP', 'RUF', 'SWU', 'RKI'],
7: ['FRO', 'IEU', 'SKI', 'KIR'],
8: ['ORI', 'UWJ', 'ZSR', 'FRO'],
9: ['URK', 'IFJ', 'VUR'],
10: ['RUF', 'OFR'],
11: ['IEU'],
12: ['PIM', 'IEU']},
'A': {0: 'FRO',
1: 'IDJ',
2: 'UCF',
3: 'ORI',
4: 'PRO',
5: 'QAZ',
6: 'PLP',
7: 'FRO',
8: 'ORI',
9: 'URK',
10: 'RUF',
11: 'IEU',
12: 'PIM'},
'B': {0: 'KDU',
1: 'WPZ',
2: 'URK',
3: nan,
4: 'IRK',
5: 'WRE',
6: 'RUF',
7: 'IEU',
8: 'UWJ',
9: 'IFJ',
10: 'OFR',
11: nan,
12: 'IEU'},
'C': {0: 'WJU',
1: 'SUR',
2: 'IRO',
3: nan,
4: 'IQA',
5: 'DCQ',
6: 'SWU',
7: 'SKI',
8: 'ZSR',
9: 'VUR',
10: nan,
11: nan,
12: nan},
'D': {0: nan,
1: nan,
2: 'DCQ',
3: nan,
4: 'RTI',
5: 'IBR',
6: 'RKI',
7: 'KIR',
8: 'FRO',
9: nan,
10: nan,
11: nan,
12: nan},
'Set Names': {0: {'FRO', 'KDU', 'WJU'},
1: {'IDJ', 'SUR', 'WPZ'},
2: {'DCQ', 'IRO', 'UCF', 'URK'},
3: {'ORI'},
4: {'BPO', 'IQA', 'IRK', 'PRO', 'RTI'},
5: {'DCQ', 'IBR', 'QAZ', 'WRE'},
6: {'PLP', 'RKI', 'RUF', 'SWU'},
7: {'FRO', 'IEU', 'KIR', 'SKI'},
8: {'FRO', 'ORI', 'UWJ', 'ZSR'},
9: {'IFJ', 'URK', 'VUR'},
10: {'OFR', 'RUF'},
11: {'IEU'},
12: {'IEU', 'PIM'}}}
You can explode 'Set Names', then group by the exploded column and collect 'Unnamed: 0' into a list per group:
(df.explode('Set Names')
   .groupby('Set Names')
   ['Unnamed: 0'].apply(list)
)
output:
Set Names
BPO [PROIRKIQARTIBPO]
DCQ [UCFURKIRODCQ, QAZWREDCQIBR]
FRO [FROKDUWJU, FROIEUSKIKIR, ORIUWJZSRFRO]
IBR [QAZWREDCQIBR]
IDJ [IDJWPZSUR]
... ...
WJU [FROKDUWJU]
WPZ [IDJWPZSUR]
WRE [QAZWREDCQIBR]
ZSR [ORIUWJZSRFRO]
If you want to filter the output to a minimum number of items per group (here > 1):
(df.explode('Set Names')
   .groupby('Set Names')
   ['Unnamed: 0'].apply(lambda g: list(g) if len(g) > 1 else None)
   .dropna()
)
output:
Set Names
DCQ [UCFURKIRODCQ, QAZWREDCQIBR]
FRO [FROKDUWJU, FROIEUSKIKIR, ORIUWJZSRFRO]
IEU [FROIEUSKIKIR, IEU, PIMIEU]
ORI [ORI, ORIUWJZSRFRO]
RUF [PLPRUFSWURKI, RUFOFR]
URK [UCFURKIRODCQ, URKIFJVUR]
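A variant of the same filter, in case you prefer to compute the grouped lists once and then filter by length (Series.str.len also works on a Series of lists; the variable name grouped is mine):
grouped = (df.explode('Set Names')
             .groupby('Set Names')
             ['Unnamed: 0'].apply(list)
          )
grouped[grouped.str.len() > 1]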

Column Values still shown after .isin()

As requested, here is a minimal reproducible example that generates the issue: .isin() apparently does not drop the excluded values, but just sets their counts to zero:
import os
import pandas as pd
df_example = pd.DataFrame({'Requesting as': {0: 'Employee', 1: 'Ex- Employee', 2: 'Employee', 3: 'Employee', 4: 'Ex-Employee', 5: 'Employee', 6: 'Employee', 7: 'Employee', 8: 'Ex-Employee', 9: 'Ex-Employee', 10: 'Employee', 11: 'Employee', 12: 'Ex-Employee', 13: 'Ex-Employee', 14: 'Employee', 15: 'Employee', 16: 'Employee', 17: 'Ex-Employee', 18: 'Employee', 19: 'Employee', 20: 'Ex-Employee', 21: 'Employee', 22: 'Employee', 23: 'Ex-Employee', 24: 'Employee', 25: 'Employee', 26: 'Ex-Employee', 27: 'Employee', 28: 'Employee', 29: 'Ex-Employee', 30: 'Employee', 31: 'Employee', 32: 'Ex-Employee', 33: 'Employee', 34: 'Employee', 35: 'Ex-Employee', 36: 'Employee', 37: 'Employee', 38: 'Ex-Employee', 39: 'Employee', 40: 'Employee'}, 'Years of service': {0: -0.4, 1: -0.3, 2: -0.2, 3: 1.0, 4: 1.0, 5: 1.0, 6: 2.0, 7: 2.0, 8: 2.0, 9: 2.0, 10: 3.0, 11: 3.0, 12: 3.0, 13: 4.0, 14: 4.0, 15: 4.0, 16: 5.0, 17: 5.0, 18: 5.0, 19: 5.0, 20: 6.0, 21: 6.0, 22: 6.0, 23: 11.0, 24: 11.0, 25: 11.0, 26: 16.0, 27: 17.0, 28: 18.0, 29: 21.0, 30: 22.0, 31: 23.0, 32: 26.0, 33: 27.0, 34: 28.0, 35: 31.0, 36: 32.0, 37: 33.0, 38: 35.0, 39: 36.0, 40: 37.0}, 'yos_bins': {0: 0, 1: 0, 2: 0, 3: '0-1', 4: '0-1', 5: '0-1', 6: '1-2', 7: '1-2', 8: '1-2', 9: '1-2', 10: '2-3', 11: '2-3', 12: '2-3', 13: '3-4', 14: '3-4', 15: '3-4', 16: '4-5', 17: '4-5', 18: '4-5', 19: '4-5', 20: '5-6', 21: '5-6', 22: '5-6', 23: '10-15', 24: '10-15', 25: '10-15', 26: '15-20', 27: '15-20', 28: '15-20', 29: '20-40', 30: '20-40', 31: '20-40', 32: '20-40', 33: '20-40', 34: '20-40', 35: '20-40', 36: '20-40', 37: '20-40', 38: '20-40', 39: '20-40', 40: '20-40'}})
cut_labels = ['0-1','1-2', '2-3', '3-4', '4-5', '5-6', '6-10', '10-15', '15-20', '20-40']
cut_bins = (0, 1, 2, 3, 4, 5, 6, 10, 15, 20, 40)
df_example['yos_bins'] = pd.cut(df_example['Years of service'], bins=cut_bins, labels=cut_labels)
print(df_example['yos_bins'].value_counts())
print(len(df_example['yos_bins']))
print(len(df_example))
print(df_example['yos_bins'].value_counts())
test = df_example[df_example['yos_bins'].isin(['0-1', '1-2', '2-3'])]
print('test dataframe:\n',test)
print('\n')
print('test value counts of yos_bins:\n', test['yos_bins'].value_counts())
print('\n')
dic_test = test.to_dict()
print(dic_test)
print('\n')
print(test.value_counts())
I have created bins for a column with "years of service":
cut_labels = ['0-1','1-2', '2-3', '3-4', '4-5', '5-6', '6-10', '10-15', '15-20', '20-40']
cut_bins = (0, 1, 2, 3, 4, 5, 6, 10, 15, 20, 40)
df['yos_bins'] = pd.cut(df['Years of service'], bins=cut_bins, labels=cut_labels)
Then I applied .isin() to the dataframe column called 'yos_bins', intending to filter for a selection of column values.
The column I use to slice is called 'yos_bins' (i.e. binned years of service). I want to select only 3 ranges (0-1, 1-2, 2-3 years), but apparently more ranges remain in the column.
To my surprise, when I apply value_counts(), I still get all values of the yos_bins column from the df dataframe (but with 0 counts):
test.yos_bins.value_counts()
This was not intended; all bins except the 3 passed to isin() should have been dropped. The resulting issue is that the zero-count bins are shown in sns.countplot, so I end up with undesired columns with zero counts.
When I save the df with to_excel(), all "10-15" value fields show a "Text Date with 2-Digit Year" error. I do not load that dataframe back into Python, so I am not sure whether this could cause the problem.
Does anybody know how I can create the test dataframe so that it consists merely of the 3 selected yos_bins values instead of showing all yos_bins values, some with zero counts?
An ugly solution, because neither numpy nor pandas makes this element-wise "is in" convenient here; in my experience I build the comparison manually with numpy arrays:
import numpy as np

yos_bins = np.array(df["yos_bins"])
yos_bins_sel = np.array(["0-1", "1-2", "2-3"])
mask = (yos_bins[:, None] == yos_bins_sel[None, :]).any(1)
df[mask]
   Requesting as  Years of service yos_bins
3       Employee               1.0      0-1
4    Ex-Employee               1.0      0-1
5       Employee               1.0      0-1
6       Employee               2.0      1-2
7       Employee               2.0      1-2
8    Ex-Employee               2.0      1-2
9    Ex-Employee               2.0      1-2
10      Employee               3.0      2-3
11      Employee               3.0      2-3
12   Ex-Employee               3.0      2-3
Explanation
(using x for yos_bins and y for yos_bins_sel)
(x[:, None] == y[None, :]).any(1) is the main takeaway. x[:, None] converts x from shape (n,) to (n, 1), and y[None, :] converts y from shape (m,) to (1, m). Comparing them with == broadcasts to an element-wise boolean array of shape (n, m). We want an (n,)-shaped array, so we apply .any(1), which compresses the second dimension to True if at least one of its booleans is True (i.e. if the element appears in the yos_bins_sel array). You end up with a boolean array that can be used to mask the original DataFrame. Replace x with the array containing the values to be compared and y with the array that the values of x should be contained in, and you can do this for any data set.
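A tiny self-contained demo of that broadcasting step, with toy arrays:
import numpy as np

x = np.array(['a', 'b', 'c'])
y = np.array(['a', 'c'])

m = x[:, None] == y[None, :]  # (3, 1) vs (1, 2) broadcasts to a (3, 2) boolean matrix
mask = m.any(1)               # (3,) -> array([ True, False,  True])
print(x[mask])                # ['a' 'c']
As a side note on the original symptom: pd.cut returns a Categorical, and value_counts() on a categorical column reports every category, used or not, which is why the zero-count bins appear; if that display (and the countplot) is the only concern, reassigning test['yos_bins'] = test['yos_bins'].cat.remove_unused_categories() drops the unused bins.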

'forward replace' with pandas

I have a df where every row looks something like this:
series = pd.Series({0: 1.0,
1: 1.0,
10: nan,
11: nan,
2: 1.0,
3: 1.0,
4: nan,
5: nan,
6: nan,
7: nan,
8: nan,
9: nan,
'B': 3.0,
'D': 1.0})
For every value in D, I want series[series.D] to be 2, and all numerical labels higher than series.D should also become 2. It's kind of a 'forward replace'.
So what I want is:
target = pd.Series({0: 1.0,
1: 2.0,
10: nan,
11: nan,
2: 2.0,
3: 2.0,
4: nan,
5: nan,
6: nan,
7: nan,
8: nan,
9: nan,
'B': 3.0,
'D': 1.0})
So far I've got:
def forward_replace(series):
    if pd.notnull(series['D']):
        cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
        target_cols = [x for x in cols if x > series.D]
        series.loc[target_cols].replace({1: 2}, inplace=True)
    return series
It seems like it's not possible to use label-based indexing with numerical column labels?
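Note that series.loc[target_cols].replace({1: 2}, inplace=True) calls replace on a copy returned by .loc, so the original series is never modified. A minimal sketch of one way to produce the target, assuming the integer index labels are the ones to scan and that only values equal to 1 should flip to 2 (the >= comparison follows the target above, where label 1 itself is also replaced):
import pandas as pd

def forward_replace(series):
    if pd.notnull(series['D']):
        # integer labels at or above the value stored in 'D'
        targets = [k for k in series.index
                   if isinstance(k, int) and k >= series['D']]
        series = series.copy()
        series.loc[targets] = series.loc[targets].replace(1.0, 2.0)
    return series
Applied to the example, forward_replace(series) turns labels 1, 2 and 3 from 1.0 into 2.0, leaves the NaNs and label 0 alone, and keeps 'B' and 'D' unchanged, matching target.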

Pandas groupby and count percentages while keeping the NaNs

I have a df with order ids and channels.
all_data_filtered = all_data[["order_id", "user_channel"]]
all_data_filtered.head(100).to_dict()
gives the following dictionary so that someone can replicate it easily with pd.DataFrame(dict_below):
{'order_id': {0: nan,
1: nan,
2: nan,
3: nan,
4: nan,
5: nan,
6: nan,
7: nan,
8: nan,
9: nan,
10: nan,
11: nan,
12: nan,
13: 7026578.0,
14: 7026584.0,
15: 7026890.0,
16: 7026959.0,
17: 7027028.0,
18: 7027109.0,
19: 7027175.0,
20: 7027193.0,
21: nan,
22: nan,
23: nan,
24: nan,
25: nan,
26: nan,
27: nan,
28: nan,
29: nan,
30: nan,
31: 7026635.0,
32: 7026842.0,
33: nan,
34: nan,
35: nan,
36: nan,
37: nan,
38: nan,
39: nan,
40: nan,
41: nan,
42: nan,
43: nan,
44: nan,
45: 7026923.0,
46: nan,
47: nan,
48: nan,
49: nan,
50: nan,
51: nan,
52: nan,
53: nan,
54: nan,
55: nan,
56: 7026662.0,
57: 7026677.0,
58: nan,
59: nan,
60: 7722035.0,
61: nan,
62: 7026686.0,
63: 7026695.0,
64: nan,
65: 7028045.0,
66: 7026698.0,
67: 7026701.0,
68: 7026740.0,
69: 7026743.0,
70: 7033025.0,
71: nan,
72: nan,
73: nan,
74: nan,
75: 7584236.0,
76: 7584395.0,
77: nan,
78: nan,
79: nan,
80: nan,
81: nan,
82: nan,
83: nan,
84: nan,
85: 7026761.0,
86: 7027055.0,
87: 7026755.0,
88: 7026758.0,
89: 7027709.0,
90: nan,
91: nan,
92: nan,
93: nan,
94: nan,
95: 7026764.0,
96: 7026824.0,
97: 7033052.0,
98: 7033112.0,
99: 7033349.0},
'user_channel': {0: 1,
1: 2,
2: 3,
3: 1,
4: 3,
5: 3,
6: 1,
7: 3,
8: 4,
9: 1,
10: 4,
11: 2,
12: 3,
13: 3,
14: 3,
15: 3,
16: 3,
17: 3,
18: 3,
19: 3,
20: 3,
21: 3,
22: 1,
23: 1,
24: 3,
25: 1,
26: 1,
27: 1,
28: 1,
29: 3,
30: 1,
31: 3,
32: 3,
33: 3,
34: 1,
35: 3,
36: 3,
37: 2,
38: 2,
39: 3,
40: 3,
41: 1,
42: 1,
43: 2,
44: 5,
45: 5,
46: 3,
47: 2,
48: 3,
49: 3,
50: 3,
51: 1,
52: 1,
53: 3,
54: 3,
55: 3,
56: 3,
57: 3,
58: 1,
59: 1,
60: 6,
61: 3,
62: 3,
63: 3,
64: 3,
65: 3,
66: 4,
67: 4,
68: 4,
69: 4,
70: 4,
71: 3,
72: 3,
73: 1,
74: 1,
75: 3,
76: 3,
77: 3,
78: 3,
79: 3,
80: 1,
81: 3,
82: 3,
83: 4,
84: 3,
85: 3,
86: 3,
87: 4,
88: 4,
89: 4,
90: 1,
91: 3,
92: 3,
93: 5,
94: 3,
95: 3,
96: 3,
97: 3,
98: 3,
99: 3}}
Those are individual orders. So I tried to group them and count them:
all_data_filtered.groupby(["user_channel"]).agg(['count'])
which gives the following and is useful because it shows that the most orders came through channel 3:
             order_id
                count
user_channel
1                 391
2                 211
3                1805
4                 425
5                 102
6                 124
7                 159
but this excludes all NaNs (users who came through a channel and submitted no order). In the end I want percentages like: 90% of users came through channel 3, 10% of them placed an order, and that makes up 40% of all orders. How can I include the NaNs so that I can calculate such percentages?
Try using .agg('size'): size counts all rows in each group (NaNs included), while count only counts the non-null order_id values.
df.groupby('user_channel').agg('size')
user_channel
1    22
2     6
3    57
4    11
5     3
6     1
dtype: int64
Versus
df.groupby('user_channel').agg('count')
              order_id
user_channel
1                    0
2                    0
3                   24
4                    8
5                    1
6                    1
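With size and count in hand, the percentages from the question can be sketched like this (assuming all_data_filtered as defined above; the intermediate names are mine):
users_per_channel = all_data_filtered.groupby('user_channel').size()
orders_per_channel = all_data_filtered.groupby('user_channel')['order_id'].count()

user_share = users_per_channel / users_per_channel.sum() * 100     # % of users per channel
conversion = orders_per_channel / users_per_channel * 100          # % of a channel's users who ordered
order_share = orders_per_channel / orders_per_channel.sum() * 100  # % of all orders per channel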

Pandas Compare rows in Dataframe

I have the following data frame (represented by the dictionary below):
{'Name': {0: '204',
1: '110838',
2: '110999',
3: '110998',
4: '111155',
5: '111710',
6: '111157',
7: '111156',
8: '111144',
9: '118972',
10: '111289',
11: '111288',
12: '111145',
13: '121131',
14: '118990',
15: '110653',
16: '110693',
17: '110694',
18: '111577',
19: '111702',
20: '115424',
21: '115127',
22: '115178',
23: '111578',
24: '115409',
25: '115468',
26: '111711',
27: '115163',
28: '115149',
29: '115251'},
'Sequence_new': {0: 1.0,
1: 2.0,
2: 3.0,
3: 4.0,
4: 5.0,
5: 6.0,
6: 7.0,
7: 8.0,
8: 9.0,
9: 10.0,
10: 11.0,
11: 12.0,
12: nan,
13: 13.0,
14: 14.0,
15: 15.0,
16: 16.0,
17: 17.0,
18: 18.0,
19: 19.0,
20: 20.0,
21: 21.0,
22: 22.0,
23: 23.0,
24: 24.0,
25: 25.0,
26: 26.0,
27: 27.0,
28: 28.0,
29: 29.0},
'Sequence_old': {0: 1,
1: 2,
2: 3,
3: 4,
4: 5,
5: 6,
6: 7,
7: 8,
8: 9,
9: 10,
10: 11,
11: 12,
12: 13,
13: 14,
14: 15,
15: 16,
16: 17,
17: 18,
18: 19,
19: 20,
20: 21,
21: 22,
22: 23,
23: 24,
24: 25,
25: 26,
26: 27,
27: 28,
28: 29,
29: 30}}
I am trying to understand what changed between the old and new sequences. If, for a given Name, Sequence_old equals Sequence_new, nothing changed. If Sequence_new is NaN, the Name was removed. Can you please help implement this in pandas?
What I have tried so far without success:
for i in range(0, len(Merge)):
    if Merge.iloc[i]['Sequence_x'] == Merge.iloc[i]['Sequence_y']:
        Merge.iloc[i]['New'] = 'N'
    else:
        Merge.iloc[i]['New'] = 'Y'
Thank you
You can use a double numpy.where with an isnull condition (note that the loop above also never changes Merge, because Merge.iloc[i]['New'] = ... is chained indexing and assigns to a temporary copy):
import numpy as np

mask = df.Sequence_old == df.Sequence_new
df['New'] = np.where(df.Sequence_new.isnull(), 'Removed',
                     np.where(mask, 'N', 'Y'))
print (df)
      Name  Sequence_new  Sequence_old      New
0      204           1.0             1        N
1   110838           2.0             2        N
2   110999           3.0             3        N
3   110998           4.0             4        N
4   111155           5.0             5        N
5   111710           6.0             6        N
6   111157           7.0             7        N
7   111156           8.0             8        N
8   111144           9.0             9        N
9   118972          10.0            10        N
10  111289          11.0            11        N
11  111288          12.0            12        N
12  111145           NaN            13  Removed
13  121131          13.0            14        Y
14  118990          14.0            15        Y
15  110653          15.0            16        Y
16  110693          16.0            17        Y
17  110694          17.0            18        Y
18  111577          18.0            19        Y
19  111702          19.0            20        Y
20  115424          20.0            21        Y
21  115127          21.0            22        Y
22  115178          22.0            23        Y
23  111578          23.0            24        Y
24  115409          24.0            25        Y
25  115468          25.0            26        Y
26  111711          26.0            27        Y
27  115163          27.0            28        Y
28  115149          28.0            29        Y
29  115251          29.0            30        Y
dic_new = {0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0, 4: 5.0, 5: 6.0, 6: 7.0, 7: 8.0, 8: 9.0, 9: 10.0, 10: 11.0, 11: 12.0,
12: 'Nan', 13: 13.0, 14: 14.0, 15: 15.0, 16: 16.0, 17: 17.0, 18: 18.0, 19: 19.0, 20: 20.0, 21: 21.0,
22: 22.0, 23: 23.0, 24: 24.0, 25: 25.0, 26: 26.0, 27: 27.0, 28: 28.0, 29: 29.0}
dic_old = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16,
16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29,
29: 30}
# Does the same thing as the code below
for a, b in zip(dic_new.items(), dic_old.items()):
    # check the new value: dic_new holds the 'Nan' marker, dic_old holds plain ints
    if str(a[1]).lower() != 'nan':
        # You can add whatever print statement you want here
        print(a[1] == b[1])
# Does the same thing as the code above
[print(a[1] == b[1]) for a, b in zip(dic_new.items(), dic_old.items()) if str(a[1]).lower() != 'nan']
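If you want labels rather than printed booleans (mirroring the N/Y/Removed scheme from the first answer), a hedged variant of the same loop:
labels = {}
for (k, new_v), (_, old_v) in zip(dic_new.items(), dic_old.items()):
    if str(new_v).lower() == 'nan':
        labels[k] = 'Removed'
    else:
        labels[k] = 'N' if new_v == old_v else 'Y'
# labels[0] == 'N', labels[12] == 'Removed', labels[13] == 'Y'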
