Explode data in a list not separated by comma - python

Let's say we have the following dataframe (given here as a dict):
{'docdb_family_id': {0: 569328,
1: 574660,
2: 1187498,
3: 1226468,
4: 1236571,
5: 1239098,
6: 1239277,
7: 1239483,
8: 1239622,
9: 1239624,
10: 1239749,
11: 1334477,
12: 1340405,
13: 1340418,
14: 1340462,
15: 1340471,
16: 1340485,
17: 1340488,
18: 1340508,
19: 1340519,
20: 1340541},
'newa_cited_docdb': {0: '[ 596005 4321416 5802640 6031690 6043910 8600475 8642629 9203255 9345445 10177065 10455451 13428248 22139349 22591458 24627241 24750476 26261826 26405611 27079105 27096884]',
1: '[ 5956195 11260528 22181831 22437920 22642946 23278096 23407037 23458128 24244657 24355363 25014714 25115774 25156886 27047688 27089078 27398716]',
2: '[ 5855196 7755392 11183886 22894980 24648618 27185399]',
3: '[ 3573464 6279285 6294985 6542463 6981930 7427770 10325811 14970234 16878329 17935009 21811002 22329817 23543436 23907898 24456108 25283772]',
4: '[ 2777078 2826073 5944733 10484188 11052747 14682645 15688752 22333410 22614097 22646501 22783765 22978728 23231683 24259740 24605606 24839432 25492752 27009992 27044704]',
5: '[ 5777407 10417156 23463145 23845079 24397163 24426379 24916732 25216234 25296619 27054560 27509152]',
6: '[ 4136523 12578497 21994155 22418792 22626616 22655464 22694825 22779403 23081767 23309829 23379411 23621952 24130698 24236071 24267003 24790872 24841797 25343500 27006578]',
7: '[21722194 23841261 23870348 24749080 26713455 26884023 26892256 27123571]',
8: '[ 3770167 9249538 20340153 21805004 21826650 23074051 23211424 23586695 23664858 24139881 24669345 24951262 25109266 25172355 25351735 26158421 27074633]',
9: '[ 3773931 10400885 23825854 24863945 24904226 25372210 26673422 27108903]',
10: '[ 6245732 6270984 6282047 6313094 6323632 6357314 12700997 14934415]',
11: '[1331950 5937719 5950928 6032897 6737094 8103287]',
12: '[22536768 23111794 23827356 24148953 24483064 24636228 26369896 26722884]',
13: '[ 4096597 6452385 9164095 19820980 22468583 23758517 24922228]',
14: '[ 6273193 6365448 9349940 10531948 13589721 20897840 21818345 22422049 23234586 23722349 24282964 24466601 25476838 26223504 26685774 26756449 26812104 26900843 27088150]',
15: '[ 3770297 6285357 21272262 21883292 22392025 23100861 23160290 23827496 24060758 25448672 26918320]',
16: '[21808322 25167492 25401922 26858065]',
17: '[ 6293130 12621423 12977043 14043576 14524083 22013480 23070753 23360636 23672818 24210016 24396413 24505095 25447453 26335550 27560125]',
18: '[21923978 23414619 23700077 23916998 23917011 23917023 24227869]',
19: '[ 3029629 3461742 8589904 10338953 10633369 16254362 22248316 22635394 24392987 25416705 26671842 27391491 27406148]',
20: None},
'paperid': {0: nan,
1: nan,
2: nan,
3: nan,
4: nan,
5: nan,
6: nan,
7: nan,
8: nan,
9: nan,
10: nan,
11: nan,
12: nan,
13: nan,
14: nan,
15: nan,
16: nan,
17: nan,
18: nan,
19: nan,
20: 1998988989.0},
'fronteer': {0: 0,
1: 0,
2: 0,
3: 0,
4: 0,
5: 0,
6: 0,
7: 0,
8: 0,
9: 0,
10: 0,
11: 0,
12: 0,
13: 0,
14: 0,
15: 0,
16: 0,
17: 0,
18: 0,
19: 0,
20: 1},
'distance': {0: nan,
1: nan,
2: nan,
3: nan,
4: nan,
5: nan,
6: nan,
7: nan,
8: nan,
9: nan,
10: nan,
11: nan,
12: nan,
13: nan,
14: nan,
15: nan,
16: nan,
17: nan,
18: nan,
19: nan,
20: 0.0},
'cited_docdb_ls': {0: '[ 596005 4321416 5802640 6031690 6043910 8600475 8642629 9203255 9345445 10177065 10455451 13428248 22139349 22591458 24627241 24750476 26261826 26405611 27079105 27096884]',
1: '[ 5956195 11260528 22181831 22437920 22642946 23278096 23407037 23458128 24244657 24355363 25014714 25115774 25156886 27047688 27089078 27398716]',
2: '[ 5855196 7755392 11183886 22894980 24648618 27185399]',
3: '[ 3573464 6279285 6294985 6542463 6981930 7427770 10325811 14970234 16878329 17935009 21811002 22329817 23543436 23907898 24456108 25283772]',
4: '[ 2777078 2826073 5944733 10484188 11052747 14682645 15688752 22333410 22614097 22646501 22783765 22978728 23231683 24259740 24605606 24839432 25492752 27009992 27044704]',
5: '[ 5777407 10417156 23463145 23845079 24397163 24426379 24916732 25216234 25296619 27054560 27509152]',
6: '[ 4136523 12578497 21994155 22418792 22626616 22655464 22694825 22779403 23081767 23309829 23379411 23621952 24130698 24236071 24267003 24790872 24841797 25343500 27006578]',
7: '[21722194 23841261 23870348 24749080 26713455 26884023 26892256 27123571]',
8: '[ 3770167 9249538 20340153 21805004 21826650 23074051 23211424 23586695 23664858 24139881 24669345 24951262 25109266 25172355 25351735 26158421 27074633]',
9: '[ 3773931 10400885 23825854 24863945 24904226 25372210 26673422 27108903]',
10: '[ 6245732 6270984 6282047 6313094 6323632 6357314 12700997 14934415]',
11: '[1331950 5937719 5950928 6032897 6737094 8103287]',
12: '[22536768 23111794 23827356 24148953 24483064 24636228 26369896 26722884]',
13: '[ 4096597 6452385 9164095 19820980 22468583 23758517 24922228]',
14: '[ 6273193 6365448 9349940 10531948 13589721 20897840 21818345 22422049 23234586 23722349 24282964 24466601 25476838 26223504 26685774 26756449 26812104 26900843 27088150]',
15: '[ 3770297 6285357 21272262 21883292 22392025 23100861 23160290 23827496 24060758 25448672 26918320]',
16: '[21808322 25167492 25401922 26858065]',
17: '[ 6293130 12621423 12977043 14043576 14524083 22013480 23070753 23360636 23672818 24210016 24396413 24505095 25447453 26335550 27560125]',
18: '[21923978 23414619 23700077 23916998 23917011 23917023 24227869]',
19: '[ 3029629 3461742 8589904 10338953 10633369 16254362 22248316 22635394 24392987 25416705 26671842 27391491 27406148]',
20: []}}
What I would like to do is explode the variable cited_docdb_ls, which contains list-like strings whose elements are separated by spaces rather than commas.
How can I do that? If it is not possible, is there a way to separate the elements by commas rather than spaces and then explode them?
The resulting dataframe should either contain cited_docdb_ls with conventional comma-separated lists, or be the exploded dataframe. I have checked the df.explode() documentation but could not find any hint on how to manage situations where the list elements are separated by spaces.
Thank you

I would use str.findall with a \d+ regex to extract the numbers and convert the strings to lists, then explode:
out = (df.assign(newa_cited_docdb=df['newa_cited_docdb'].str.findall(r'\d+'),
                 cited_docdb_ls=df['cited_docdb_ls'].str.findall(r'\d+'))
         .explode(['newa_cited_docdb', 'cited_docdb_ls'])
      )
NB. If you don't have only digits, a \w+ regex will be more generic. However, if the strings can also contain [ or ] anywhere other than as the first and last character (e.g. [abc 12]3 45d]), then @jezrael's answer will be an alternative.
output:
    docdb_family_id newa_cited_docdb       paperid  fronteer  distance  \
0            569328           596005           NaN         0       NaN
0            569328          4321416           NaN         0       NaN
0            569328          5802640           NaN         0       NaN
0            569328          6031690           NaN         0       NaN
0            569328          6043910           NaN         0       NaN
..              ...              ...           ...       ...       ...
19          1340519         25416705           NaN         0       NaN
19          1340519         26671842           NaN         0       NaN
19          1340519         27391491           NaN         0       NaN
19          1340519         27406148           NaN         0       NaN
20          1340541             None  1.998989e+09         1       0.0

   cited_docdb_ls
0          596005
0         4321416
0         5802640
0         6031690
0         6043910
..            ...
19       25416705
19       26671842
19       27391491
19       27406148
20            NaN

[239 rows x 6 columns]
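Note that str.findall returns lists of strings, so the exploded values are strings as well. If you need numbers afterwards, a minimal follow-up sketch (assuming the out frame from above; Int64 is pandas' nullable integer dtype, which tolerates the NaN left in row 20):
out['newa_cited_docdb'] = pd.to_numeric(out['newa_cited_docdb']).astype('Int64')
out['cited_docdb_ls'] = pd.to_numeric(out['cited_docdb_ls']).astype('Int64')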

Use Series.str.strip with Series.str.split for both columns and then DataFrame.explode:
df = (df.assign(newa_cited_docdb=df['newa_cited_docdb'].str.strip('[]').str.split(),
                cited_docdb_ls=df['cited_docdb_ls'].str.strip('[]').str.split())
        .explode(['newa_cited_docdb', 'cited_docdb_ls']))
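For a single value, the combination works like this (a toy Series, just for illustration):
pd.Series(['[ 596005 4321416]']).str.strip('[]').str.split()
# 0    [596005, 4321416]
# dtype: object
str.strip('[]') removes the surrounding brackets and str.split() without arguments splits on any run of whitespace, which matches the format of these strings. Note that, like str.findall, this keeps the values as strings.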

Related

How to get categories of words containing unique 3-letter set from the columns of pandas dataframe in Python?

I have a dataframe df which looks as
         Unnamed: 0           Characters Split    A    B    C    D                  Set Names
0         FROKDUWJU            [FRO, KDU, WJU]  FRO  KDU  WJU  NaN            {WJU, KDU, FRO}
1         IDJWPZSUR            [IDJ, WPZ, SUR]  IDJ  WPZ  SUR  NaN            {SUR, WPZ, IDJ}
2      UCFURKIRODCQ       [UCF, URK, IRO, DCQ]  UCF  URK  IRO  DCQ       {UCF, URK, DCQ, IRO}
3               ORI                      [ORI]  ORI  NaN  NaN  NaN                      {ORI}
4   PROIRKIQARTIBPO  [PRO, IRK, IQA, RTI, BPO]  PRO  IRK  IQA  RTI  {IQA, BPO, PRO, IRK, RTI}
5      QAZWREDCQIBR       [QAZ, WRE, DCQ, IBR]  QAZ  WRE  DCQ  IBR       {DCQ, QAZ, IBR, WRE}
6      PLPRUFSWURKI       [PLP, RUF, SWU, RKI]  PLP  RUF  SWU  RKI       {PLP, SWU, RKI, RUF}
7      FROIEUSKIKIR       [FRO, IEU, SKI, KIR]  FRO  IEU  SKI  KIR       {SKI, IEU, KIR, FRO}
8      ORIUWJZSRFRO       [ORI, UWJ, ZSR, FRO]  ORI  UWJ  ZSR  FRO       {UWJ, ORI, ZSR, FRO}
9         URKIFJVUR            [URK, IFJ, VUR]  URK  IFJ  VUR  NaN            {URK, VUR, IFJ}
10           RUFOFR                 [RUF, OFR]  RUF  OFR  NaN  NaN                 {OFR, RUF}
11              IEU                      [IEU]  IEU  NaN  NaN  NaN                      {IEU}
12           PIMIEU                 [PIM, IEU]  PIM  IEU  NaN  NaN                 {PIM, IEU}
The first column contains certain names. The Characters Split column contains each name split into 3-letter chunks, in the form of a list. Columns A, B, C, and D contain the breakdown of those 3-letter chunks. Column Set Names has the same 3-letter chunks, but in the form of a set.
Some of the 3-letter chunks are shared between different names. For example, "FRO" is present in the names at index 0, 7 and 8. For names that have a 3-letter chunk in common, I'd like to put them into one category, preferably in the form of a list. Is it possible to have these categories for each unique 3-letter chunk? What would be a suitable way to do it?
df.to_dict() is as shown:
{'Unnamed: 0': {0: 'FROKDUWJU',
1: 'IDJWPZSUR',
2: 'UCFURKIRODCQ',
3: 'ORI',
4: 'PROIRKIQARTIBPO',
5: 'QAZWREDCQIBR',
6: 'PLPRUFSWURKI',
7: 'FROIEUSKIKIR',
8: 'ORIUWJZSRFRO',
9: 'URKIFJVUR',
10: 'RUFOFR',
11: 'IEU',
12: 'PIMIEU'},
'Characters Split': {0: ['FRO', 'KDU', 'WJU'],
1: ['IDJ', 'WPZ', 'SUR'],
2: ['UCF', 'URK', 'IRO', 'DCQ'],
3: ['ORI'],
4: ['PRO', 'IRK', 'IQA', 'RTI', 'BPO'],
5: ['QAZ', 'WRE', 'DCQ', 'IBR'],
6: ['PLP', 'RUF', 'SWU', 'RKI'],
7: ['FRO', 'IEU', 'SKI', 'KIR'],
8: ['ORI', 'UWJ', 'ZSR', 'FRO'],
9: ['URK', 'IFJ', 'VUR'],
10: ['RUF', 'OFR'],
11: ['IEU'],
12: ['PIM', 'IEU']},
'A': {0: 'FRO',
1: 'IDJ',
2: 'UCF',
3: 'ORI',
4: 'PRO',
5: 'QAZ',
6: 'PLP',
7: 'FRO',
8: 'ORI',
9: 'URK',
10: 'RUF',
11: 'IEU',
12: 'PIM'},
'B': {0: 'KDU',
1: 'WPZ',
2: 'URK',
3: nan,
4: 'IRK',
5: 'WRE',
6: 'RUF',
7: 'IEU',
8: 'UWJ',
9: 'IFJ',
10: 'OFR',
11: nan,
12: 'IEU'},
'C': {0: 'WJU',
1: 'SUR',
2: 'IRO',
3: nan,
4: 'IQA',
5: 'DCQ',
6: 'SWU',
7: 'SKI',
8: 'ZSR',
9: 'VUR',
10: nan,
11: nan,
12: nan},
'D': {0: nan,
1: nan,
2: 'DCQ',
3: nan,
4: 'RTI',
5: 'IBR',
6: 'RKI',
7: 'KIR',
8: 'FRO',
9: nan,
10: nan,
11: nan,
12: nan},
'Set Names': {0: {'FRO', 'KDU', 'WJU'},
1: {'IDJ', 'SUR', 'WPZ'},
2: {'DCQ', 'IRO', 'UCF', 'URK'},
3: {'ORI'},
4: {'BPO', 'IQA', 'IRK', 'PRO', 'RTI'},
5: {'DCQ', 'IBR', 'QAZ', 'WRE'},
6: {'PLP', 'RKI', 'RUF', 'SWU'},
7: {'FRO', 'IEU', 'KIR', 'SKI'},
8: {'FRO', 'ORI', 'UWJ', 'ZSR'},
9: {'IFJ', 'URK', 'VUR'},
10: {'OFR', 'RUF'},
11: {'IEU'},
12: {'IEU', 'PIM'}}}
You can explode 'Set Names', then group by the exploded column and collect 'Unnamed: 0' into a list per group:
(df.explode('Set Names')
   .groupby('Set Names')
   ['Unnamed: 0'].apply(list)
)
output:
Set Names
BPO [PROIRKIQARTIBPO]
DCQ [UCFURKIRODCQ, QAZWREDCQIBR]
FRO [FROKDUWJU, FROIEUSKIKIR, ORIUWJZSRFRO]
IBR [QAZWREDCQIBR]
IDJ [IDJWPZSUR]
... ...
WJU [FROKDUWJU]
WPZ [IDJWPZSUR]
WRE [QAZWREDCQIBR]
ZSR [ORIUWJZSRFRO]
If you want to filter the output to a minimum number of items per group (here > 1):
(df.explode('Set Names')
   .groupby('Set Names')
   ['Unnamed: 0'].apply(lambda g: list(g) if len(g) > 1 else None)
   .dropna()
)
output:
Set Names
DCQ [UCFURKIRODCQ, QAZWREDCQIBR]
FRO [FROKDUWJU, FROIEUSKIKIR, ORIUWJZSRFRO]
IEU [FROIEUSKIKIR, IEU, PIMIEU]
ORI [ORI, ORIUWJZSRFRO]
RUF [PLPRUFSWURKI, RUFOFR]
URK [UCFURKIRODCQ, URKIFJVUR]
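A variant of the same filter, in case you prefer to compute the grouped lists once and then filter by length (Series.str.len also works on a Series of lists; the variable name grouped is mine):
grouped = (df.explode('Set Names')
             .groupby('Set Names')
             ['Unnamed: 0'].apply(list)
          )
grouped[grouped.str.len() > 1]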

Column Values still shown after .isin()

As requested, here is a minimal reproducible example that generates the issue: .isin() apparently does not drop the excluded values, but just sets their counts to zero:
import os
import pandas as pd
df_example = pd.DataFrame({'Requesting as': {0: 'Employee', 1: 'Ex- Employee', 2: 'Employee', 3: 'Employee', 4: 'Ex-Employee', 5: 'Employee', 6: 'Employee', 7: 'Employee', 8: 'Ex-Employee', 9: 'Ex-Employee', 10: 'Employee', 11: 'Employee', 12: 'Ex-Employee', 13: 'Ex-Employee', 14: 'Employee', 15: 'Employee', 16: 'Employee', 17: 'Ex-Employee', 18: 'Employee', 19: 'Employee', 20: 'Ex-Employee', 21: 'Employee', 22: 'Employee', 23: 'Ex-Employee', 24: 'Employee', 25: 'Employee', 26: 'Ex-Employee', 27: 'Employee', 28: 'Employee', 29: 'Ex-Employee', 30: 'Employee', 31: 'Employee', 32: 'Ex-Employee', 33: 'Employee', 34: 'Employee', 35: 'Ex-Employee', 36: 'Employee', 37: 'Employee', 38: 'Ex-Employee', 39: 'Employee', 40: 'Employee'}, 'Years of service': {0: -0.4, 1: -0.3, 2: -0.2, 3: 1.0, 4: 1.0, 5: 1.0, 6: 2.0, 7: 2.0, 8: 2.0, 9: 2.0, 10: 3.0, 11: 3.0, 12: 3.0, 13: 4.0, 14: 4.0, 15: 4.0, 16: 5.0, 17: 5.0, 18: 5.0, 19: 5.0, 20: 6.0, 21: 6.0, 22: 6.0, 23: 11.0, 24: 11.0, 25: 11.0, 26: 16.0, 27: 17.0, 28: 18.0, 29: 21.0, 30: 22.0, 31: 23.0, 32: 26.0, 33: 27.0, 34: 28.0, 35: 31.0, 36: 32.0, 37: 33.0, 38: 35.0, 39: 36.0, 40: 37.0}, 'yos_bins': {0: 0, 1: 0, 2: 0, 3: '0-1', 4: '0-1', 5: '0-1', 6: '1-2', 7: '1-2', 8: '1-2', 9: '1-2', 10: '2-3', 11: '2-3', 12: '2-3', 13: '3-4', 14: '3-4', 15: '3-4', 16: '4-5', 17: '4-5', 18: '4-5', 19: '4-5', 20: '5-6', 21: '5-6', 22: '5-6', 23: '10-15', 24: '10-15', 25: '10-15', 26: '15-20', 27: '15-20', 28: '15-20', 29: '20-40', 30: '20-40', 31: '20-40', 32: '20-40', 33: '20-40', 34: '20-40', 35: '20-40', 36: '20-40', 37: '20-40', 38: '20-40', 39: '20-40', 40: '20-40'}})
cut_labels = ['0-1','1-2', '2-3', '3-4', '4-5', '5-6', '6-10', '10-15', '15-20', '20-40']
cut_bins = (0, 1, 2, 3, 4, 5, 6, 10, 15, 20, 40)
df_example['yos_bins'] = pd.cut(df_example['Years of service'], bins=cut_bins, labels=cut_labels)
print(df_example['yos_bins'].value_counts())
print(len(df_example['yos_bins']))
print(len(df_example))
print(df_example['yos_bins'].value_counts())
test = df_example[df_example['yos_bins'].isin(['0-1', '1-2', '2-3'])]
print('test dataframe:\n',test)
print('\n')
print('test value counts of yos_bins:\n', test['yos_bins'].value_counts())
print('\n')
dic_test = test.to_dict()
print(dic_test)
print('\n')
print(test.value_counts())
I have created bins for a column with "years of service":
cut_labels = ['0-1','1-2', '2-3', '3-4', '4-5', '5-6', '6-10', '10-15', '15-20', '20-40']
cut_bins = (0, 1, 2, 3, 4, 5, 6, 10, 15, 20, 40)
df['yos_bins'] = pd.cut(df['Years of service'], bins=cut_bins, labels=cut_labels)
Then I applied .isin() to the dataframe column called 'yos_bins', intending to filter for a selection of column values.
The column I use to slice is called 'yos_bins' (i.e. binned years of service). I want to select only 3 ranges (0-1, 1-2, 2-3 years), but apparently more ranges remain in the column.
To my surprise, when I apply value_counts(), I still get all values of the yos_bins column from the df dataframe (but with 0 counts):
test.yos_bins.value_counts()
This was not intended; all bins except the 3 passed to isin() should have been dropped. The resulting issue is that the zero-count bins are shown in sns.countplot, so I end up with undesired columns with zero counts.
When I save the df with to_excel(), all "10-15" value fields show a "Text Date with 2-Digit Year" error. I do not load that dataframe back into Python, so I am not sure whether this could cause the problem.
Does anybody know how I can create the test dataframe so that it consists merely of the 3 selected yos_bins values instead of showing all yos_bins values, some with zero counts?
An ugly solution, because neither numpy nor pandas makes this element-wise "is in" convenient here; in my experience I build the comparison manually with numpy arrays:
import numpy as np

yos_bins = np.array(df["yos_bins"])
yos_bins_sel = np.array(["0-1", "1-2", "2-3"])
mask = (yos_bins[:, None] == yos_bins_sel[None, :]).any(1)
df[mask]
   Requesting as  Years of service yos_bins
3       Employee               1.0      0-1
4    Ex-Employee               1.0      0-1
5       Employee               1.0      0-1
6       Employee               2.0      1-2
7       Employee               2.0      1-2
8    Ex-Employee               2.0      1-2
9    Ex-Employee               2.0      1-2
10      Employee               3.0      2-3
11      Employee               3.0      2-3
12   Ex-Employee               3.0      2-3
Explanation
(using x for yos_bins and y for yos_bins_sel)
(x[:, None] == y[None, :]).any(1) is the main takeaway. x[:, None] converts x from shape (n,) to (n, 1), and y[None, :] converts y from shape (m,) to (1, m). Comparing them with == broadcasts to an element-wise boolean array of shape (n, m). We want an (n,)-shaped array, so we apply .any(1), which compresses the second dimension to True if at least one of its booleans is True (i.e. if the element appears in the yos_bins_sel array). You end up with a boolean array that can be used to mask the original DataFrame. Replace x with the array containing the values to be compared and y with the array that the values of x should be contained in, and you can do this for any data set.
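A tiny self-contained demo of that broadcasting step, with toy arrays:
import numpy as np

x = np.array(['a', 'b', 'c'])
y = np.array(['a', 'c'])

m = x[:, None] == y[None, :]  # (3, 1) vs (1, 2) broadcasts to a (3, 2) boolean matrix
mask = m.any(1)               # (3,) -> array([ True, False,  True])
print(x[mask])                # ['a' 'c']
As a side note on the original symptom: pd.cut returns a Categorical, and value_counts() on a categorical column reports every category, used or not, which is why the zero-count bins appear; if that display (and the countplot) is the only concern, reassigning test['yos_bins'] = test['yos_bins'].cat.remove_unused_categories() drops the unused bins.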

'forward replace' with pandas

I have a df where every row looks something like this:
series = pd.Series({0: 1.0,
1: 1.0,
10: nan,
11: nan,
2: 1.0,
3: 1.0,
4: nan,
5: nan,
6: nan,
7: nan,
8: nan,
9: nan,
'B': 3.0,
'D': 1.0})
For every value in D, I want series[series.D] to be 2, and all numerical labels higher than series.D should also become 2. It's kind of a 'forward replace'.
So what I want is:
target = pd.Series({0: 1.0,
1: 2.0,
10: nan,
11: nan,
2: 2.0,
3: 2.0,
4: nan,
5: nan,
6: nan,
7: nan,
8: nan,
9: nan,
'B': 3.0,
'D': 1.0})
So far I've got:
def forward_replace(series):
    if pd.notnull(series['D']):
        cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
        target_cols = [x for x in cols if x > series.D]
        series.loc[target_cols].replace({1: 2}, inplace=True)
    return series
It seems like it's not possible to use label-based indexing with numerical column labels?
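Note that series.loc[target_cols].replace({1: 2}, inplace=True) calls replace on a copy returned by .loc, so the original series is never modified. A minimal sketch of one way to produce the target, assuming the integer index labels are the ones to scan and that only values equal to 1 should flip to 2 (the >= comparison follows the target above, where label 1 itself is also replaced):
import pandas as pd

def forward_replace(series):
    if pd.notnull(series['D']):
        # integer labels at or above the value stored in 'D'
        targets = [k for k in series.index
                   if isinstance(k, int) and k >= series['D']]
        series = series.copy()
        series.loc[targets] = series.loc[targets].replace(1.0, 2.0)
    return series
Applied to the example, forward_replace(series) turns labels 1, 2 and 3 from 1.0 into 2.0, leaves the NaNs and label 0 alone, and keeps 'B' and 'D' unchanged, matching target.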

Pandas groupby and count percentages while keeping the NaNs

I have a df with order ids and channels.
all_data_filtered = all_data[["order_id", "user_channel"]]
all_data_filtered.head(100).to_dict()
gives the following dictionary so that someone can replicate it easily with pd.DataFrame(dict_below):
{'order_id': {0: nan,
1: nan,
2: nan,
3: nan,
4: nan,
5: nan,
6: nan,
7: nan,
8: nan,
9: nan,
10: nan,
11: nan,
12: nan,
13: 7026578.0,
14: 7026584.0,
15: 7026890.0,
16: 7026959.0,
17: 7027028.0,
18: 7027109.0,
19: 7027175.0,
20: 7027193.0,
21: nan,
22: nan,
23: nan,
24: nan,
25: nan,
26: nan,
27: nan,
28: nan,
29: nan,
30: nan,
31: 7026635.0,
32: 7026842.0,
33: nan,
34: nan,
35: nan,
36: nan,
37: nan,
38: nan,
39: nan,
40: nan,
41: nan,
42: nan,
43: nan,
44: nan,
45: 7026923.0,
46: nan,
47: nan,
48: nan,
49: nan,
50: nan,
51: nan,
52: nan,
53: nan,
54: nan,
55: nan,
56: 7026662.0,
57: 7026677.0,
58: nan,
59: nan,
60: 7722035.0,
61: nan,
62: 7026686.0,
63: 7026695.0,
64: nan,
65: 7028045.0,
66: 7026698.0,
67: 7026701.0,
68: 7026740.0,
69: 7026743.0,
70: 7033025.0,
71: nan,
72: nan,
73: nan,
74: nan,
75: 7584236.0,
76: 7584395.0,
77: nan,
78: nan,
79: nan,
80: nan,
81: nan,
82: nan,
83: nan,
84: nan,
85: 7026761.0,
86: 7027055.0,
87: 7026755.0,
88: 7026758.0,
89: 7027709.0,
90: nan,
91: nan,
92: nan,
93: nan,
94: nan,
95: 7026764.0,
96: 7026824.0,
97: 7033052.0,
98: 7033112.0,
99: 7033349.0},
'user_channel': {0: 1,
1: 2,
2: 3,
3: 1,
4: 3,
5: 3,
6: 1,
7: 3,
8: 4,
9: 1,
10: 4,
11: 2,
12: 3,
13: 3,
14: 3,
15: 3,
16: 3,
17: 3,
18: 3,
19: 3,
20: 3,
21: 3,
22: 1,
23: 1,
24: 3,
25: 1,
26: 1,
27: 1,
28: 1,
29: 3,
30: 1,
31: 3,
32: 3,
33: 3,
34: 1,
35: 3,
36: 3,
37: 2,
38: 2,
39: 3,
40: 3,
41: 1,
42: 1,
43: 2,
44: 5,
45: 5,
46: 3,
47: 2,
48: 3,
49: 3,
50: 3,
51: 1,
52: 1,
53: 3,
54: 3,
55: 3,
56: 3,
57: 3,
58: 1,
59: 1,
60: 6,
61: 3,
62: 3,
63: 3,
64: 3,
65: 3,
66: 4,
67: 4,
68: 4,
69: 4,
70: 4,
71: 3,
72: 3,
73: 1,
74: 1,
75: 3,
76: 3,
77: 3,
78: 3,
79: 3,
80: 1,
81: 3,
82: 3,
83: 4,
84: 3,
85: 3,
86: 3,
87: 4,
88: 4,
89: 4,
90: 1,
91: 3,
92: 3,
93: 5,
94: 3,
95: 3,
96: 3,
97: 3,
98: 3,
99: 3}}
Those are individual orders. So I tried to group them and count them:
all_data_filtered.groupby(["user_channel"]).agg(['count'])
which gives the following and is useful because it shows that the most orders came through channel 3:
             order_id
                count
user_channel
1                 391
2                 211
3                1805
4                 425
5                 102
6                 124
7                 159
but this excludes all NaNs (users who came through a channel and submitted no order). In the end I want percentages like: 90% of users came through channel 3, 10% of them placed an order, and that makes up 40% of all orders. How can I include the NaNs so that I can calculate such percentages?
Try using .agg('size'): size counts all rows in each group (NaNs included), while count only counts the non-null order_id values.
df.groupby('user_channel').agg('size')
user_channel
1    22
2     6
3    57
4    11
5     3
6     1
dtype: int64
Versus
df.groupby('user_channel').agg('count')
              order_id
user_channel
1                    0
2                    0
3                   24
4                    8
5                    1
6                    1
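With size and count in hand, the percentages from the question can be sketched like this (assuming all_data_filtered as defined above; the intermediate names are mine):
users_per_channel = all_data_filtered.groupby('user_channel').size()
orders_per_channel = all_data_filtered.groupby('user_channel')['order_id'].count()

user_share = users_per_channel / users_per_channel.sum() * 100     # % of users per channel
conversion = orders_per_channel / users_per_channel * 100          # % of a channel's users who ordered
order_share = orders_per_channel / orders_per_channel.sum() * 100  # % of all orders per channel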

Pandas Compare rows in Dataframe

I have the following data frame (represented by the dictionary below):
{'Name': {0: '204',
1: '110838',
2: '110999',
3: '110998',
4: '111155',
5: '111710',
6: '111157',
7: '111156',
8: '111144',
9: '118972',
10: '111289',
11: '111288',
12: '111145',
13: '121131',
14: '118990',
15: '110653',
16: '110693',
17: '110694',
18: '111577',
19: '111702',
20: '115424',
21: '115127',
22: '115178',
23: '111578',
24: '115409',
25: '115468',
26: '111711',
27: '115163',
28: '115149',
29: '115251'},
'Sequence_new': {0: 1.0,
1: 2.0,
2: 3.0,
3: 4.0,
4: 5.0,
5: 6.0,
6: 7.0,
7: 8.0,
8: 9.0,
9: 10.0,
10: 11.0,
11: 12.0,
12: nan,
13: 13.0,
14: 14.0,
15: 15.0,
16: 16.0,
17: 17.0,
18: 18.0,
19: 19.0,
20: 20.0,
21: 21.0,
22: 22.0,
23: 23.0,
24: 24.0,
25: 25.0,
26: 26.0,
27: 27.0,
28: 28.0,
29: 29.0},
'Sequence_old': {0: 1,
1: 2,
2: 3,
3: 4,
4: 5,
5: 6,
6: 7,
7: 8,
8: 9,
9: 10,
10: 11,
11: 12,
12: 13,
13: 14,
14: 15,
15: 16,
16: 17,
17: 18,
18: 19,
19: 20,
20: 21,
21: 22,
22: 23,
23: 24,
24: 25,
25: 26,
26: 27,
27: 28,
28: 29,
29: 30}}
I am trying to understand what changed between the old and new sequences. If, for a given Name, Sequence_old equals Sequence_new, nothing changed. If Sequence_new is NaN, the Name was removed. Can you please help implement this in pandas?
What I have tried so far without success:
for i in range(0, len(Merge)):
    if Merge.iloc[i]['Sequence_x'] == Merge.iloc[i]['Sequence_y']:
        Merge.iloc[i]['New'] = 'N'
    else:
        Merge.iloc[i]['New'] = 'Y'
Thank you
You can use a double numpy.where with an isnull condition (note that the loop above also never changes Merge, because Merge.iloc[i]['New'] = ... is chained indexing and assigns to a temporary copy):
import numpy as np

mask = df.Sequence_old == df.Sequence_new
df['New'] = np.where(df.Sequence_new.isnull(), 'Removed',
                     np.where(mask, 'N', 'Y'))
print (df)
      Name  Sequence_new  Sequence_old      New
0      204           1.0             1        N
1   110838           2.0             2        N
2   110999           3.0             3        N
3   110998           4.0             4        N
4   111155           5.0             5        N
5   111710           6.0             6        N
6   111157           7.0             7        N
7   111156           8.0             8        N
8   111144           9.0             9        N
9   118972          10.0            10        N
10  111289          11.0            11        N
11  111288          12.0            12        N
12  111145           NaN            13  Removed
13  121131          13.0            14        Y
14  118990          14.0            15        Y
15  110653          15.0            16        Y
16  110693          16.0            17        Y
17  110694          17.0            18        Y
18  111577          18.0            19        Y
19  111702          19.0            20        Y
20  115424          20.0            21        Y
21  115127          21.0            22        Y
22  115178          22.0            23        Y
23  111578          23.0            24        Y
24  115409          24.0            25        Y
25  115468          25.0            26        Y
26  111711          26.0            27        Y
27  115163          27.0            28        Y
28  115149          28.0            29        Y
29  115251          29.0            30        Y
dic_new = {0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0, 4: 5.0, 5: 6.0, 6: 7.0, 7: 8.0, 8: 9.0, 9: 10.0, 10: 11.0, 11: 12.0,
12: 'Nan', 13: 13.0, 14: 14.0, 15: 15.0, 16: 16.0, 17: 17.0, 18: 18.0, 19: 19.0, 20: 20.0, 21: 21.0,
22: 22.0, 23: 23.0, 24: 24.0, 25: 25.0, 26: 26.0, 27: 27.0, 28: 28.0, 29: 29.0}
dic_old = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16,
16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29,
29: 30}
# Does the same thing as the code below
for a, b in zip(dic_new.items(), dic_old.items()):
    # check the new value: dic_new holds the 'Nan' marker, dic_old holds plain ints
    if str(a[1]).lower() != 'nan':
        # You can add whatever print statement you want here
        print(a[1] == b[1])
# Does the same thing as the code above
[print(a[1] == b[1]) for a, b in zip(dic_new.items(), dic_old.items()) if str(a[1]).lower() != 'nan']
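If you want labels rather than printed booleans (mirroring the N/Y/Removed scheme from the first answer), a hedged variant of the same loop:
labels = {}
for (k, new_v), (_, old_v) in zip(dic_new.items(), dic_old.items()):
    if str(new_v).lower() == 'nan':
        labels[k] = 'Removed'
    else:
        labels[k] = 'N' if new_v == old_v else 'Y'
# labels[0] == 'N', labels[12] == 'Removed', labels[13] == 'Y'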
