Applying regex to dataframe column based on value in another column - python

I have the regex_func helper function below, which has been working well to extract a match from a DataFrame column using map and a lambda.
import re

def regex_func(regex_compile, x, item=0, return_list=False):
    """Handle the list returned by re.findall().

    Takes the value at position `item` (the first by default).
    If the list is empty, returns an empty string."""
    match_list = regex_compile.findall(x)
    if return_list:
        match = match_list
    elif match_list:
        try:
            match = match_list[item]
        except IndexError:
            match = ""
    else:
        match = ""
    return match

# Working example
regex_1 = re.compile(r'(?i)(?<=\()[^ ()]+')
df['colB'] = df['colA'].map(lambda x: regex_func(regex_1, x))
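For reference, here is a minimal run of the working example with invented sample data (the df below is hypothetical, just to show what the lookbehind pattern extracts):

import re
import pandas as pd

df = pd.DataFrame({'colA': ['alpha (beta) gamma', 'no parens here']})
regex_1 = re.compile(r'(?i)(?<=\()[^ ()]+')

# colB gets the first token inside parentheses, or '' when there is none
df['colB'] = df['colA'].map(lambda x: regex_func(regex_1, x))
print(df['colB'].tolist())  # ['beta', '']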
I am having trouble with a similar task: I want the regex to be built from a value in another column and then applied. One method I tried that did not work:
# The regex should be based on the value in col1.
# Extract that value and prep it for input into regex_func().
value_list = df['col1'].tolist()
value_list = ['(?i)(?<=' + d + ' )[^ ]+' for d in value_list]
value_list = [re.compile(d) for d in value_list]

# Add the prepped list back into df as col2.
df.insert(1, 'col2', value_list)

# Trying to create col4 by applying the compiled regex in col2 to the value in col3.
df.insert(2, 'col4', df['col3'].map(lambda x: df['col2'], x))
I understand why the above doesn't work, but have not been able to find a solution.

You can zip the columns and then build the regex on the fly:
# regex_func expects a compiled pattern, so compile one per row
df['colB'] = [regex_func(re.compile('(?i)(?<=' + y + ' )[^ ]+'), x)
              for x, y in zip(df['colA'], df['col1'])]
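A quick sketch of how this plays out, with invented sample data (column names as in the question; consider re.escape(y) if col1 can contain regex metacharacters):

import re
import pandas as pd

# Hypothetical data: col1 holds the word whose follower we want from colA
df = pd.DataFrame({'col1': ['quick', 'lazy'],
                   'colA': ['the quick brown fox', 'the lazy dog sleeps']})

df['colB'] = [regex_func(re.compile('(?i)(?<=' + y + ' )[^ ]+'), x)
              for x, y in zip(df['colA'], df['col1'])]
print(df['colB'].tolist())  # ['brown', 'dog']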

Related

want to apply merge function on column A

How can I apply a merge function, or any other method, on column A?
For example, in layman's terms, I want to convert this string "(A|B|C,D)|(A,B|C|D)|(B|C|D)" into
"(D A|D B|D C)|(A B|A C|A D)|(B|C|D)"
The group (B|C|D) will remain the same, as it has no comma-separated value to merge in it. Basically, within each group, I want to merge the comma-separated values into each of the other values.
I have the data frame below.
import pandas as pd

data = {'A': ['(A|B|C,D)|(A,B|C|D)|(B|C|D)'],
        'B(Expected)': ['(D A|D B|D C)|(A B|A C|A D)|(B|C|D)']}
df = pd.DataFrame(data)
print(df)
My expected result is shown in column B(Expected).
Below are the methods I tried:
(1)
df['B(Expected)'] = df['A'].apply(lambda x: x.replace("|", " ").replace(",", "|") if "|" in x and "," in x else x)
(2)
# Split the string by the pipe character
df['string'] = df['string'].str.split('|')
df['string'] = df['string'].apply(lambda x: '|'.join([' '.join(i.split(' ')) for i in x]))
You can use a regex to extract the values in parentheses, then a custom function with itertools.product to reorganize the values:
from itertools import product

def split(s):
    return '|'.join([' '.join(x) for x in product(*[x.split('|') for x in s.split(',')])])

df['B'] = df['A'].str.replace(r'([^()]+)', lambda m: split(m.group()), regex=True)
print(df)
Note that this requires non-nested parentheses.
Output:
A B
0 (A|B|C,D)|(A,B|C|D)|(B|C|D) (A D|B D|C D)|(A B|A C|A D)|(B|C|D)
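To see what the helper does on a single parenthesized group, here is a small trace of the intermediate steps for 'A|B|C,D' (variable names invented for illustration):

from itertools import product

s = 'A|B|C,D'
parts = [x.split('|') for x in s.split(',')]  # [['A', 'B', 'C'], ['D']]
combos = list(product(*parts))                # [('A', 'D'), ('B', 'D'), ('C', 'D')]
print('|'.join(' '.join(x) for x in combos))  # A D|B D|C D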

how to extract all repeating patterns from a string into a dataframe

I have a dataframe with the equipment codes of certain trucks; below is a similar list of lists of the cells:
x = [['A0B','A1C','A1Z','A2E','A5C','B1B','B1F','B1H','B2A'],
     ['A0A','A0B','A1C','A1Z','A2I','A5L','B1B','B1F','B1H','B2A','B2X','B3H','B4L','B5E','B5J','C0G','C1W','C5B','C5D'],
     ['A0B','A1C','A1Z','A2E','A5C','B1B','B1F','B1H','B2A','B2X','B4L','B5C','B5I','C0A','C1J','C5B','C5D','C6C','C6J','C6Q']]
I want to extract all the values that start with "B", for example ("B1B,B1F,B1H"); ("B1B,B1F,B1H,B2A,B2X,B3H"); ("B1B,B1F,B1H,B2A,B2X,B4L,B5C,B5I"). I tried the code below, but every row has a different length:
sublista = ['B1B','B1F','B1H','B2A','B2X','B4L','B5C','B5I']
df3 = pd.DataFrame(columns=['FIN', 'Equipmentcodes', 'AQUATARDER', 'CAJA'])
for elemento in sublista:
    df_aux = df2[df2['Equipmentcodes'].str.contains(elemento, case=False)]
    df_aux['CAJA'] = elemento
    df3 = df3.append(df_aux, ignore_index=True)
Assuming your column contains strings, you could use a regex:
df['selected'] = (df['code']
                  .str.extractall(r'\b(B[^,]*)\b')[0]
                  .groupby(level=0).apply(','.join)
                  )
example input:
x = ['A0B,A1C,A1Z,A2E,A5C,B1B,B1F,B1H,B2A',
     'A0A,A0B,A1C,A1Z,A2I,A5L,B1B,B1F,B1H,B2A,B2X,B3H,B4L,B5E,B5J,C0G,C1W,C5B,C5D',
     'A0B,A1C,A1Z,A2E,A5C,B1B,B1F,B1H,B2A,B2X,B4L,B5C,B5I,C0A,C1J,C5B,C5D,C6C,C6J,C6Q']
df = pd.DataFrame({'code': x})
output:
selected code
0 B1B,B1F,B1H,B2A A0B,A1C,A1Z,A2E,A5C,B1B,B1F,B1H,B2A
1 B1B,B1F,B1H,B2A,B2X,B3H,B4L,B5E,B5J A0A,A0B,A1C,A1Z,A2I,A5L,B1B,B1F,B1H,B2A,B2X,B3H,B4L,B5E,B5J,C0G,C1W,C5B,C5D
2 B1B,B1F,B1H,B2A,B2X,B4L,B5C,B5I A0B,A1C,A1Z,A2E,A5C,B1B,B1F,B1H,B2A,B2X,B4L,B5C,B5I,C0A,C1J,C5B,C5D,C6C,C6J,C6Q
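A short sketch of why the groupby(level=0) step is needed: str.extractall returns one row per match with a MultiIndex of (original row, match number), so collapsing on level 0 rebuilds one joined string per input row (the data below is a trimmed, hypothetical version of the example):

import pandas as pd

df = pd.DataFrame({'code': ['A0B,B1B,B1F', 'B2A,C0G,B3H']})
matches = df['code'].str.extractall(r'\b(B[^,]*)\b')[0]

# matches is indexed by (row, match): (0,0)='B1B', (0,1)='B1F', (1,0)='B2A', (1,1)='B3H'
print(matches.groupby(level=0).apply(','.join).tolist())  # ['B1B,B1F', 'B2A,B3H']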

Replace string in pandas dataframe if it contains specific substring

I have a dataframe generated from a .csv file (I use Python 3.5). df['category'] contains only strings. I want to check this column and, if a string contains a specific substring (I am not really interested in where it occurs, only that it exists), replace it. I am using this script:
import pandas as pd
df=pd.read_csv('lastfile.csv')
df.dropna(inplace=True)
g='Drugs'
z='Weapons'
c='Flowers'
df.category = df.category.str.lower().apply(lambda x: g if ('mdma' or 'xanax' or 'kamagra' or 'weed' or 'tabs' or 'lsd' or 'heroin' or 'morphine' or 'hci' or 'cap' or 'mda' or 'hash' or 'kush' or 'wax'or 'klonop'or\
'dextro'or'zepam'or'amphetamine'or'ketamine'or 'speed' or 'xtc' or 'XTC' or 'SPEED' or 'crystal' or 'meth' or 'marijuana' or 'powder' or 'afghan'or'cocaine'or'haze'or'pollen'or\
'sativa'or'indica'or'valium'or'diazepam'or'tablet'or'codeine'or \
'mg' or 'dmt'or'diclazepam'or'zepam'or 'heroin' ) in x else(z if ('weapon'or'milit'or'gun'or'grenades'or'submachine'or'rifle'or'ak47')in x else c) )
print(df['category'])
My problem is that some records do not get replaced, even though they contain some of the substrings I defined. Is it a regex-related problem?
Thank you in advance.
This is not a regex problem: an expression like ('mdma' or 'xanax' or ...) in x short-circuits, so the parenthesized or-chain evaluates to just 'mdma' and only that one substring is ever checked. Instead, create a dictionary of substring lists keyed by the replacement strings, loop over it, join each list with | for regex OR, check the column with str.contains, and replace the matched rows with loc:
df = pd.DataFrame({'category': ['sss mdma df', 'milit ss aa', 'aa ss']})

a = ['mdma', 'xanax', 'kamagra']
b = ['weapon', 'milit', 'gun']

g = 'Drugs'
z = 'Weapons'
c = 'Flowers'

d = {g: a, z: b}

df['new_category'] = c
for k, v in d.items():
    pat = '|'.join(v)
    mask = df.category.str.contains(pat, case=False)
    df.loc[mask, 'new_category'] = k

print(df)
category new_category
0 sss mdma df Drugs
1 milit ss aa Weapons
2 aa ss Flowers
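To make the original bug concrete, here is a minimal illustration of how Python evaluates the or-chain (standard Python semantics, independent of pandas):

x = 'cheap xanax for sale'

# The or-chain short-circuits to its first truthy operand, 'mdma',
# so the whole expression only ever tests for 'mdma':
print(('mdma' or 'xanax' or 'kamagra') in x)               # False

# Testing each substring separately is what was intended:
print(any(s in x for s in ['mdma', 'xanax', 'kamagra']))   # True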

How to find a string in data frame which has the text with underscore in between

In a data frame I have values like the example below:
Rhymes (column name)
Johny johny.yes_papa eating
sugar
No papa.open_mouth_ha ha ha
The output should be the list of all the strings which have underscores, like:
yes_papa,
open_mouth_ha
I am trying with the code below, but I am only getting the column names of the dataframe:
df[df["Rhymes"].str.contains("_&_&_", na=False)]
Should I be using regular expressions to get an exact match?
The method pd.Series.str.contains returns a Boolean series; it doesn't return the strings you want.
You can instead use a custom function with str.split, apply this to your series, drop nulls and convert back to a dataframe:
df = pd.DataFrame({'Rhymes': ['Johny johny.yes_papa eating', 'sugar',
                              'No papa.open_mouth_ha ha ha']})

def get_underscores(x):
    return next((i for i in x.replace('.', ' ').split() if '_' in i), None)

res = df['Rhymes'].apply(get_underscores).dropna().to_frame()
print(res)
Rhymes
0 yes_papa
2 open_mouth_ha
For a single string, it works like this:
string = "Johny johny yes_papa eating sugar No papa open_mouth_ha ha ha"
def find_underscore(string):
a = []
for i in string.split():
for j in i:
if j == '_':
a.append(i)
return a
for a column of data frame:
new_list = []
for index, row in df.iterrows():
print(find_underscore(row["column_name"]))
new_list.append(find_underscore(row["column_name"]))
df.new_column = new_list
Try this to get rows containing an underscore:
df[df["Rhymes"].str.contains("_")]
Or this to get just values:
df.loc[df["Rhymes"].str.contains("_"), "Rhymes"].values

How to get the row index for pandas apply function on a Series

I have a DataFrame that I split into column Series (col_series in the snippet below) and use apply to test each value in each Series. But I would like to report which row in the Series is affected when I detect an error.
...
col_series.apply(self.testdatelimits,
                 args=(datetime.strptime('2018-01-01', '%Y-%m-%d'), key))

def testlimits(self, row_id, x, lowerlimit, col_name):
    low_error = None
    d = float(x)
    if lowerlimit != 'NA' and d < float(lowerlimit):
        low_error = 'Following record has column ' + col_name + ' lower than range check'
    if low_error is not None:
        self.set_error(col_index, row_id, low_error)
Of course the above fails because x is a str and does not have the name property. I am thinking that maybe I can pass in the row index of the Series, but I am not clear on how to do that.
Edit:
I switched to building the arguments as columns and calling list(map(...)) rather than Series.apply to solve this issue. It is significantly faster too:
col_series = col_series.apply(pd.to_datetime, errors='ignore')
dfwithrow = pd.DataFrame(col_series)
dfwithrow.insert(0, 'rowid', range(0, len(dfwithrow)))
dfwithrow['lowerlimit'] = lowlimit
dfwithrow['colname'] = 'fred'

list(map(self.testdatelimits, dfwithrow['rowid'], dfwithrow[colvalue[0]],
         dfwithrow['lowerlimit'], dfwithrow['colname']))
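For completeness, a minimal sketch of a more direct route: Series.items() yields (index, value) pairs, so the row label is available without building a helper DataFrame (the data and limit below are hypothetical):

import pandas as pd

col_series = pd.Series([5.0, 12.0, 3.0], name='fred')
lowerlimit = 4.0

# items() yields (row_label, value) pairs, so the offending row is known
for row_id, x in col_series.items():
    if float(x) < lowerlimit:
        print(f'row {row_id}: column {col_series.name} lower than range check')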
