How to replace substrings in a dataframe in Python - python

I have a dataframe, where I want to replace some words to others, based on another dataframe:
import pandas as pd
dist = pd.DataFrame([["21","apple"],["25","balana"],["30","lemon"]],columns=["idx","item"])
a = pd.DataFrame(["apple - banana"],columns=["pf"])
# NOTE(review): this is the line that fails. Series.replace expects its
# to_replace/value arguments as scalars, lists, or a dict; passing two Series
# does not pair them element-wise into a substring translation table.
# Converting the two columns to a dict (as the answers below do) is the fix.
a['pf'] = a['pf'].replace(dist["item"], dist["idx"], regex=True)
print(a)
How can I do that? (this does not work in its current form)

You can try this:
# Translation table: each idx string should replace its item word.
dist = pd.DataFrame([["21", "apple"], ["25", "balana"], ["30", "lemon"]],
                    columns=["idx", "item"])
a = pd.DataFrame(["apple - banana"], columns=["pf"])

# Map replacement id -> word to be replaced.
b = dict(zip(dist["idx"], dist["item"]))


def replace_items(token):
    """Replace every known item word in *token* with its idx string.

    Words absent from *token* are simply skipped; unknown words are left
    untouched (e.g. "banana" stays, because dist only knows "balana").
    """
    for key, value in b.items():
        # value is the word to look for, key is the idx that replaces it
        token = token.replace(value, key)
    return token


a["pf"] = a["pf"].apply(replace_items)
Please be aware that "banana" in your dist dataframe is misspelled as "balana", so "banana" in the text is never replaced. Not sure if this is intended...

Converting the translation table to dictionary seems to solve the problem:
import pandas as pd
dist = pd.DataFrame([["apple","21"],["banana","25"],["lemon","30"]],columns=["item","idx"])
dist = dist.set_index('item')['idx'].to_dict()
a = pd.DataFrame(["apple - banana"],columns=["pf"])
a['pf'] = a['pf'].replace(dist, regex=True)
print(a)

Related

why do i get a key error from output when i do a merge

hi please help me I am trying to fuzzy merge using pandas and fuzzywuzzy on two datasets using two columns from each, but I get a traceback at the line before the print function that says KeyError: ('name', 'lasntname'), I do not know if I am referencing wrong or what, I have tried the double brackets and parenthesis no luck
heres the code
import pandas as pd
from fuzzywuzzy import fuzz, process
from itertools import product
# Similarity threshold: keep only pairs scoring above 80.
N = 80
# Score every cross-product pair of first names from df1 and df2.
# NOTE(review): df1/df2 are not defined in this snippet — presumably loaded
# earlier; verify their actual column names.
names = {tup: fuzz.ratio(*tup) for tup in
product(df1["Name"].tolist(),
df2["name"].tolist())}
s1 = pd.Series(names)
s1 = s1[s1 > N]
s1 = s1[s1.groupby(level=0).idxmax()]
# Same cross-product scoring for last names.
surnames = {tup: fuzz.ratio(*tup) for tup in
product(df1["Last_name"].tolist(),
df2["lasntname"].tolist())}
s2 = pd.Series(surnames)
s2 = s2[s2 > N]
s2 = s2[s2.groupby(level=0).idxmax()]
# map and fill nulls
df2["name"] =
df2["name"].map(s1).fillna(df2["name"])
df2["lasntname"] =
df2["lasntname"].map(s2).fillna(df2["lasntname"])
# NOTE(review): the reported KeyError: ('name', 'lasntname') comes from this
# merge — df1 uses 'Name' and 'Last_name' (different capitalisation/spelling),
# so the merge keys do not exist in df1. Rename the columns so both frames
# use identical key names before merging.
df = df1.merge(df2, on=["name", "lasntname"],
how='outer')
print(df)
Hi — just make your column names uniform across both tables (e.g. rename df1's 'Name' and 'Last_name' to match df2's 'name' and 'lasntname'), and the merge should work.

Replace cells in a dataframe with a range of values

I have a large dataframe that has certain cells with values like <25-27>. Is there a simple way to convert these into something like 25|26|27?
Source data frame:
import pandas as pd
import numpy as np
# Input frame: some cells hold a "<lo-hi>" range marker instead of a plain value.
f = {'function':['2','<25-27>','200'],'CP':['<31-33>','210','4001']}
# NOTE(review): 'filter' shadows the built-in filter(); a different name
# (e.g. 'df_in') would be safer.
filter = pd.DataFrame(data=f)
filter
Output Required
# Desired result: each "<lo-hi>" expanded to "lo|lo+1|...|hi".
output = {'function':['2','25|26|27','200'],'CP':['31|32|33','210','4001']}
op = pd.DataFrame(data=output)
op
thanks a lot !
import re


def convert_range(x):
    """Expand a "<lo-hi>" cell into "lo|lo+1|...|hi"; pass other cells through.

    Parameters
    ----------
    x : str
        Cell value; only strings starting with a "<25-27>"-shaped marker are
        rewritten (re.match anchors at the start, as in the original).

    Returns
    -------
    str
        The expanded, pipe-joined inclusive range, or *x* unchanged.
    """
    # Raw string: the original "\-" is an invalid escape in a normal string
    # literal, and the stray "+" after the first group was redundant.
    m = re.match(r"<([0-9]+)-([0-9]+)>", x)
    if m is None:
        # Not a range marker — leave the cell untouched.
        return x
    lo, hi = m.groups()
    # +1 makes the upper bound inclusive, matching the requested output.
    return "|".join(str(v) for v in range(int(lo), int(hi) + 1))
# Apply convert_range to every cell of the question's 'filter' frame.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map — confirm the pandas version in use.
op = filter.applymap(convert_range)

Pandas: Replacing string with hashed string via regex

I have a DataFrame with 29 columns, and need to replace part of a string in some columns with a hashed part of the string.
Example of the column is as follows:
ABSX, PLAN=PLAN_A ;SFFBJD
ADSFJ, PLAN=PLAN_B ;AHJDG
...
...
Code that captures the part of the string:
# NOTE(review): the pattern '(?<=PLAN=)(^"]+ ;)' is malformed — '(^"]+' looks
# like a negated character class missing its opening bracket, i.e. '([^"]+ ;)'.
# As written it will not capture the plan text.
Test[14] = Test[14].replace({'(?<=PLAN=)(^"]+ ;)' :'hello'}, regex=True)
I want to change the 'hello' to hash of '(?<=PLAN=)(^"]+ ;)' but it doesn't work this way. Wanted to check if anyone did this before without looping line by line of the DataFrame?
here is what I suggest:
import hashlib
import re

import pandas as pd

# First reproduce a dataset similar to the question's column.
df = pd.DataFrame({"v1": ["ABSX", "ADSFJ"],
                   "v2": ["PLAN=PLAN_A", "PLAN=PLAN_B"],
                   "v3": ["SFFBJD", "AHJDG"]})

# Extract the "=PLAN_X" part of v2 and hash the plan name.
plan_re = re.compile(r"=[a-zA-Z_]+")
df["matched_el"] = ["".join(plan_re.findall(w)) for w in df.v2]
df["matched_el"] = df["matched_el"].str.replace("=", "")
df["matched_el"] = [hashlib.md5(w.encode()).hexdigest() for w in df.matched_el]
# Replace the plan name in v2 with its MD5 hash. regex=True must be explicit:
# since pandas 2.0 Series.str.replace treats the pattern as a literal string
# by default, which would silently leave v2 unchanged.
df["v2"] = df["v2"].str.replace(r"=[a-zA-Z_]+", "=", regex=True) + df["matched_el"]
df = df.drop(columns="matched_el")
Here is the result
v1 v2 v3
0 ABSX PLAN=8d846f78aa0b0debd89fc1faafc4c40f SFFBJD
1 ADSFJ PLAN=3b9a3c8184829ca5571cb08c0cf73c8d AHJDG

Python Pandas rolling mean DataFrame Constructor not properly called

I am trying to create a simple time-series of different rolling types. One specific example is a rolling mean of N periods, using the pandas Python package.
I get the following error : ValueError: DataFrame constructor not properly called!
Below is my code :
def py_TA_MA(v, n, AscendType):
df = pd.DataFrame(v, columns=['Close'])
df = df.sort_index(ascending=AscendType) # ascending/descending flag
# NOTE(review): source of the reported ValueError — df['Close'].rolling(n)
# returns a Rolling object, not data, and pd.Series cannot be constructed
# from it. An aggregation such as .mean() must be called on the Rolling
# first (as the corrected version below does).
M = pd.Series(df['Close'].rolling(n), name = 'MovingAverage_' + str(n))
df = df.join(M)
df = df.sort_index(ascending=True) #need to double-check this
return df
Would anyone be able to advise?
Kind regards
Found the correction! It was erroring out (with a new error) because I had to explicitly declare n as an integer. The code below works:
# xlwings decorators from the Excel UDF setup, quoted here as comments:
#xw.func
#xw.arg('n', numbers = int, doc = 'this is the rolling window')
#xw.ret(expand='table')
def py_TA_MA(v, n, AscendType):
    """Return *v* as a DataFrame with an n-period rolling-mean column.

    Parameters
    ----------
    v : array-like
        Price series; becomes the 'Close' column.
    n : int
        Rolling window length (must be an integer — passing a float was the
        original error).
    AscendType : bool
        Sort direction applied before the moving average is computed.
    """
    df = pd.DataFrame(v, columns=['Close'])
    # ascending/descending flag — controls the direction the window rolls over
    df = df.sort_index(ascending=AscendType)
    M = pd.Series(df['Close'], name='Moving Average').rolling(window=n).mean()
    df = df.join(M)
    # Hand the result back in ascending order regardless of AscendType.
    df = df.sort_index(ascending=True)
    return df

what is the source of this error: python pandas

import pandas as pd
census_df = pd.read_csv('census.csv')
#census_df.head()
def answer_seven():
# SUMLEV == 50 keeps county-level rows only.
census_df_1 = census_df[(census_df['SUMLEV'] == 50)].set_index('CTYNAME')
# NOTE(review): two problems here — the column names are misspelled
# ('POPESTIAMTE...' instead of 'POPESTIMATE...', which causes the reported
# KeyError), and .max()/.min() default to axis=0 (per column); axis=1 is
# needed to aggregate across the year columns per county.
census_df_1['highest'] = census_df_1[['POPESTIAMTE2010','POPESTIAMTE2011','POPESTIAMTE2012','POPESTIAMTE2013','POPESTIAMTE2014','POPESTIAMTE2015']].max()
census_df_1['lowest'] =census_df_1[['POPESTIAMTE2010','POPESTIAMTE2011','POPESTIAMTE2012','POPESTIAMTE2013','POPESTIAMTE2014','POPESTIAMTE2015']].min()
x = abs(census_df_1['highest'] - census_df_1['lowest']).tolist()
return x[0]
answer_seven()
This is trying to use the data from census.csv to find the counties that have the largest absolute change in population within 2010-2015(POPESTIMATES), I wanted to simply find the difference between abs.value of max and min value for each year/column. You must return a string. also [(census_df['SUMLEV'] ==50)] means only counties are taken as they are set to 50. But the code gives an error that ends with
KeyError: "['POPESTIAMTE2010' 'POPESTIAMTE2011' 'POPESTIAMTE2012'
'POPESTIAMTE2013'\n 'POPESTIAMTE2014' 'POPESTIAMTE2015'] not in index"
Am I indexing the wrong data structure? I'm really new to datascience and coding.
I think the column names in the code have typo. The pattern is 'POPESTIMATE201?' and not 'POPESTIAMTE201?'
Any help with shortening the code will be appreciated. Here is the code that works -
census_df = pd.read_csv('census.csv')


def answer_seven():
    """Return the CTYNAME with the largest absolute population change
    across the POPESTIMATE2010-2015 columns (county-level rows only)."""
    # SUMLEV == 50 keeps county-level rows only.
    cdf = census_df[census_df['SUMLEV'] == 50].set_index('CTYNAME')
    columns = ['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012',
               'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015']
    # axis=1: aggregate across the year columns, i.e. one max/min per county.
    # Direct Series subtraction replaces the original's one-column-DataFrame
    # .sub() assignment, which is fragile across pandas versions.
    cdf['change'] = cdf[columns].max(axis=1) - cdf[columns].min(axis=1)
    return cdf['change'].idxmax()

Categories

Resources