How can I create string indexing instead of numbers from dataframe? - python

I want to create unique row identifiers, in place of the default integer index, from the contents of the columns of a dataframe.
For example,
import pandas as pd
from pprint import pprint
df = pd.DataFrame(columns=["ID", "Animal", "Weight", "Description"])
df["ID"] = ["Qw9457", "gft878"]
df["Animal"] = ["Mouse", "Lion"]
df["Weight"] = [20, 67]
df["Description"] = ["hsdg rie", "gtre sjdhi"]
pprint(df)
Output:
ID Animal Weight Description
0 Qw9457 Mouse 20 hsdg rie
1 gft878 Lion 67 gtre sjdhi
I'd prefer to rename the index using the contents of the rest of the columns,
for example:
df.index = ["MQwrie", "Lgfgt"]
I would like to know if there are nice ways to programmatically generate row identifiers (i.e. the index) from the contents of the columns.

If you are looking to generate an index based on bits of the data in each column, you can piece it together using Series operations and then assign the index. Below, we use the first letter of the animal's name, the weight, and the first word of the description as a new index.
import pandas as pd
df = pd.DataFrame({'ID': ['Qw9457', 'gft878'],
                   'Animal': ['Mouse', 'Lion'],
                   'Weight': [20, 67],
                   'Description': ['hsdg rie', 'gtre sjdhi']})
# create new index from data in df, assign as index
ix = df.Animal.str[0] + df.Weight.astype(str) + df.Description.str.split().str.get(0)
df_new = df.set_index(ix)
df_new
# returns:
ID Animal Weight Description
M20hsdg Qw9457 Mouse 20 hsdg rie
L67gtre gft878 Lion 67 gtre sjdhi
EDIT:
Yes. If you also want to add the current row number (starting at zero), you can use:
ix = (
    df.Animal.str[0]
    + df.Weight.astype(str)
    + df.Description.str.split().str.get(0)
    + df.index.astype(str).str.zfill(3)
)
df_new = df.set_index(ix)
df_new
#returns:
ID Animal Weight Description
M20hsdg000 Qw9457 Mouse 20 hsdg rie
L67gtre001 gft878 Lion 67 gtre sjdhi
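If the rule for building the label grows, an equivalent row-wise version can be easier to read. This is just a minimal sketch assuming the same df as above; it produces the same index as the Series-based version:
# build the label row by row with apply, then assign it as the index
label = df.apply(lambda row: f"{row.Animal[0]}{row.Weight}{row.Description.split()[0]}", axis=1)
df_new = df.set_index(label)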

Related

Pandas: If two or more rows have same values in particular column then get its count and add to next row

Sort the data frame by the values of the 5th column ["Card"] and add a new row after each card number with its count. After sorting the values, how can I add a new row with the total?
(The original post shows the input dataframe and the desired output dataframe as images.)
You can give this a try:
import pandas as pd
# create dummy df
card = ["2222","2222","1111","2222","1111","3333"]
name = ["Ed", "Ed", "John", "Ed", "John", "Kevin"]
phone = ["1##-###-####", "1##-###-####", "2##-###-####", "1##-###-####", "2##-###-####", "3##-###-####"]
df = pd.DataFrame({"Name":name, "Phone":phone, "Card":card})
# sort by Card value
df = df.sort_values(by=["Card"]).reset_index(drop=True)
# Groupby the Card value, count them, then insert a new row based on that count
index = 0
line = []
for x in df.groupby("Card").size():
    index += x
    line.append(pd.DataFrame({"Name": "", "Phone": "", "Card": str(x)}, index=[index]))
# note: DataFrame.append was removed in pandas 2.0; the edited version below uses pd.concat instead
df = df.append(line, ignore_index=False)
df = df.sort_values(by=["Card"]).sort_index().reset_index(drop=True)
df
Output:
Name Phone Card
0 Ed 1##-###-#### 1111
1 Ed 1##-###-#### 1111
2 Ed 1##-###-#### 1111
3 3
4 John 2##-###-#### 2222
5 John 2##-###-#### 2222
6 2
7 Kevin 3##-###-#### 3333
8 1
Edit:
Because the OP stores the card numbers as strings, the code had to be edited to naturally sort the string integers.
import pandas as pd
import random  # needed for random.randrange below
from natsort import natsort_keygen  # now needed because the OP has the Card numbers as strings
# create dummy df ##############
card = ["1111", "2222", "3333", "4444", "5555", "6666", "7777", "8888"]
name = ["Ed", "John", "Jake", "Mike", "Liz", "Anne", "Deb", "Steph"]
phone = ["1###", "2###", "3###", "4###", "5###", "6###", "7###", "8###"]
dfList = [a for a in zip(name, phone, card)]
dfList = [dfList[random.randrange(len(dfList))] for i in range(50)]
df = pd.DataFrame(dfList, columns=["Name", "Phone", "Card"])
################################
# sort by Card value
df = df.sort_values(by=["Card"]).reset_index(drop=True)
# Groupby the Card value, count them, then insert a new row based on that count
index = 0
line = []
for x in df.groupby("Card").size():
index += x
line.append(pd.DataFrame({"Name": "", "Phone":"", "Card": str(x)}, index=[index-1]))
df = pd.concat([df, pd.concat(line)], ignore_index=False)
# Create an Index column to be used in the by pandas sort_values
df["Index"] = df.index
# Sort the values first by index then by card number, use "natsort_keygen()" to naturally sort ints that are strings
df = df.sort_values(by = ['Index', 'Card'], key=natsort_keygen(), ascending = [True, False]).reset_index(drop=True).drop(["Index"], axis=1)
I'm not sure if this is the best way but it worked for me.
list_of_df = [v for k, v in df.groupby("card")]
for i in enumerate(list_of_df):
list_of_df[i[0]] = list_of_df[i[0]].append({"card":str(list_of_df[i[0]].shape[0])}, ignore_index=True)
final_df = pd.concat(list_of_df)
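Since DataFrame.append was removed in pandas 2.0, here is a minimal concat-only sketch of the same idea, assuming the Name/Phone/Card dummy data from the first answer: walk the groups in card order and follow each group with a one-row count frame.
parts = []
for card, grp in df.groupby("Card", sort=True):
    parts.append(grp)
    # one-row frame holding the count for this card
    parts.append(pd.DataFrame({"Name": [""], "Phone": [""], "Card": [str(len(grp))]}))
result = pd.concat(parts, ignore_index=True)
print(result)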

How to group by the words and create an equivalent column consisting of float values? (Pandas)

I have a dataframe:
Text
Background
Clinical
Method
Direct
Background
Direct
Now I want to group them in a new column according to their first words, e.g. Background belongs to group 1, Clinical belongs to group 2, and so on.
The expected output:
a dataframe:
Text Group
Background 1
Clinical 2
Method 3
Direct 4
Background 1
Direct 4
Try this:
import pandas as pd
text = ['Background', 'Clinical', 'Method', 'Direct', 'Background', 'Direct']
df = pd.DataFrame(text, columns=['Text'])
def create_idx_map():
    idx = 1
    values = {}
    for item in list(df['Text']):
        if item not in values:
            values[item] = idx
            idx += 1
    return values
values = create_idx_map()
df['Group'] = [values[x] for x in list(df['Text'])]
print(df)
Idea: Make a list of unique values of the column Text and for the column Group you can assign the index of the value in this unique list. Code example:
df = pd.DataFrame({"Text": ["Background", "Clinical", "Clinical", "Method", "Background"]})
# List of unique values of column `Text`
groups = list(df["Text"].unique())
# Assign each value in `Text` its index
# (you can write `groups.index(text) + 1` if the first value should be 1)
df["Group"] = df["Text"].map(lambda text: groups.index(text))
# Output for df
print(df)
### Result:
Text Group
0 Background 0
1 Clinical 1
2 Clinical 1
3 Method 2
4 Background 0
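As an aside, pandas has this mapping built in: pd.factorize assigns an integer code to each distinct value in order of first appearance. A minimal sketch, assuming the same df as in the question:
# factorize returns (codes, uniques); add 1 so the first group is 1 instead of 0
codes, uniques = pd.factorize(df["Text"])
df["Group"] = codes + 1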
A solution could be the following:
import pandas as pd
data = pd.DataFrame([["A B", 1], ["A C", 2], ["B A", 3], ["B C", 5]], columns=("name", "value"))
data.groupby(by=[x.split(" ")[0] for x in data.loc[:,"name"]])
You can select the first few words using x.split(" ")[:NUMBER_OF_WORDS]. You then apply whatever aggregation you want to the resulting GroupBy object.
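For example, a minimal sketch (assuming the data frame defined above) that counts the rows per first word:
# group by the first word of "name" and count the rows in each group
counts = data.groupby(by=[x.split(" ")[0] for x in data.loc[:, "name"]]).size()
print(counts)
# A    2
# B    2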

change the first occurrence in a pandas column based on certain condition

So I would like to change the first character in the number column to "+233" when that first character is "0"; basically, I would like all rows in number to be like that of row Paul.
Both columns are string objects.
Expectations:
The first character of the values in the column df["number"] should be replaced with "+233", but only if it is "0".
df = pd.DataFrame([["ken", "080222333222"],
                   ["ben", "+233948433"],
                   ["Paul", "0800000073"]],
                  columns=['name', 'number'])
Hope I understood your edit; try this.
Notice: I removed the first 0 and replaced it with +233.
import pandas as pd
df = pd.DataFrame([["ken", "080222333222"], ["ben", "+233948433"], ["Paul", "0800000073"]], columns=['name', 'number'])
def convert_number(row):
    if row[0] == '0':
        row = "+233" + row[1:]
    return row
df['number'] = df['number'].apply(convert_number)
print(df)
You can use replace directly:
df['replace_Col'] = df.number.str.replace(r'^0', '+233', n=1, regex=True)
which produces
name number replace_Col
0 ken 080222333222 +23380222333222
1 ben +233948433 +233948433
2 Paul 0800000073 +233800000073
The full code to reproduce the above
import pandas as pd
df = pd.DataFrame([['ken', '080222333222'], ['ben', '+233948433'],
['Paul', '0800000073']], columns=['name', 'number'])
df['replace_Col'] = df.number.str.replace(r'^0', '+233', n=1, regex=True)
print(df)
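If you prefer to avoid regex, a minimal sketch of the same idea using a boolean mask (same df as above):
# select the rows whose number starts with "0" and rebuild just those values
mask = df['number'].str.startswith('0')
df.loc[mask, 'number'] = '+233' + df.loc[mask, 'number'].str[1:]
print(df)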

How to create a pivot table where both the index and the column are the unique values of the same df column?

I have a df as follows:
QUESTIONCODE SUBJETCS
1 English
1 French
2 English
3 Japanese
4 English
4 Japanese
And I would like to create a pivot table where both the index and the columns are the unique SUBJECTS values from df, filled with the number of QUESTIONCODEs shared by the pair of SUBJECTS represented by each index and column. The result would then be:
English French Japanese
English 3 1 1
French 1 1 0
Japanese 1 0 2
I have already tried several approaches using pandas functions such as groupby, pivot_table and crosstab, but I still could not get the result shown above.
Could anyone please help me with that?
I was able to find a solution, but it is by no means the best one. I believe it will help you get started. I provided some comments with the code. Let me know if you have any questions. Cheers.
import pandas as pd
import itertools
import collections
# this is your data
df = pd.DataFrame({"QUESTIONCODE": [1,1,2,3,4,4],
"SUBJETCS": ["English", "French", "English", "Japanese", "English", "Japanese"]})
df["SUBJETCS_"] = df["SUBJETCS"] # I am duplicating the subject column here
# pivoting to get the counts of each subject
dfPivot = df.pivot_table(index="SUBJETCS", columns="SUBJETCS_", values="QUESTIONCODE", aggfunc=["count"], fill_value=0).reset_index()
dfPivot.columns = ["SUBJETCS"] + sorted(df["SUBJETCS"].unique())
x = df.groupby("QUESTIONCODE")["SUBJETCS"].apply(",".join).reset_index() # for each QUESTIONCODE taking its subjects as a DataFrame
dictCombos = dict(zip(x["QUESTIONCODE"].tolist(), [s.split(",") for s in x["SUBJETCS"].tolist()])) # this will map QUESTIONCODE to its subject as a dictionary
list_all_pairs = []  # this will have all possible pairs of subjects
for k, v in dictCombos.items():
    # v = list(set(v.split(",")))
    prm = list(itertools.permutations(v, 2))
    list_all_pairs.extend(prm)
dictMap = {c: i for i, c in enumerate(dfPivot.columns[1:])} # just maps each subject to an index
dictCounts = dict(collections.Counter(list_all_pairs)) # dictionary of all pairs to its counts
dictCoords = {}  # indexing each subject, i.e. English 0, French 1, ...; this will allow loading the counts as a matrix
for pairs, counts in dictCounts.items():
    coords = (dictMap[pairs[0]], dictMap[pairs[1]])
    dictCoords[coords] = counts
x = dfPivot.iloc[:, 1:].values  # saving the content of the pivot into a 2-dimensional array
for coords, counts in dictCoords.items():
    x[coords[0], coords[1]] = counts
dfCounts = pd.DataFrame(x, columns=dfPivot.columns[1:])  # loading the content of the array into a DataFrame
df = pd.concat([dfPivot.iloc[:, 0], dfCounts], axis=1)  # and finally putting it all together
This is the code I mentioned in the comment on griggy's answer!
import pandas as pd
import itertools
import collections
# Creating the dataframe
df = pd.DataFrame({"QUESTIONCODE": [1,1,2,3,4,4],
"SUBJETCS": ["English", "French", "English", "Japanese", "English", "Japanese"]})
# Pivoting to get the counts of each subject
dfPivot = pd.pivot_table(df, values='QUESTIONCODE', index='SUBJETCS', columns='SUBJETCS', aggfunc='count', fill_value=0)
# Creating a dataframe with each QUESTIONCODE and its SUBJECTs
x = df.groupby("QUESTIONCODE")["SUBJETCS"].apply(",".join).reset_index()
# Mapping QUESTIONCODE to its SUBJECTs as a dictionary
dictCombos = dict(zip(x["QUESTIONCODE"].tolist(), [s.split(",") for s in x["SUBJETCS"].tolist()]))
# Creating a list with all possible pairs of SUBJECTs
list_all_pairs = []
for k, v in dictCombos.items():
    prm = list(itertools.permutations(v, 2))
    list_all_pairs.extend(prm)
# Creating a dictionary of all pairs of SUBJECTs and their counts
dictCounts = dict(collections.Counter(list_all_pairs))
# Filling the dfPivot dataframe with all pairs of SUBJECTs and their counts
for pairs, counts in dictCounts.items():
    dfPivot.loc[pairs] = counts
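For reference, a shorter route to the same co-occurrence matrix is a self-merge on QUESTIONCODE followed by pd.crosstab. A minimal sketch, assuming the df with QUESTIONCODE and SUBJETCS defined above:
# pair every subject with every other subject that shares a QUESTIONCODE, then count the pairs
merged = df.merge(df, on="QUESTIONCODE")
matrix = pd.crosstab(merged["SUBJETCS_x"], merged["SUBJETCS_y"])
print(matrix)  # English/French/Japanese on both axes; the diagonal holds each subject's own count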

Appending Multi-index column headers to existing dataframe

I'm looking to append multi-index column headers to an existing dataframe; this is my current dataframe.
Name = pd.Series(['John','Paul','Sarah'])
Grades = pd.Series(['A','A','B'])
HumanGender = pd.Series(['M','M','F'])
DogName = pd.Series(['Rocko','Oreo','Cosmo'])
Breed = pd.Series(['Bulldog','Poodle','Golden Retriever'])
Age = pd.Series([2,5,4])
DogGender = pd.Series(['F','F','F'])
SchoolName = pd.Series(['NYU','UCLA','UCSD'])
Location = pd.Series(['New York','Los Angeles','San Diego'])
df = pd.DataFrame({'Name': Name, 'Grades': Grades, 'HumanGender': HumanGender, 'DogName': DogName, 'Breed': Breed,
                   'Age': Age, 'DogGender': DogGender, 'SchoolName': SchoolName, 'Location': Location})
I want to add 3 labels on top of the existing columns I already have. For example, columns [0,1,2,3] should be labeled 'People', columns [4,5,6] should be labeled 'Dogs', and columns [7,8] should be labeled 'Schools'. In the final result, there should be 3 labels on top of the 9 existing columns.
Thanks!
IIUC, you can do:
newlevel = ['People']*4 + ['Dogs']*3 + ['Schools']*2
df.columns = pd.MultiIndex.from_tuples([*zip(newlevel, df.columns)])
Note that [*zip(newlevel, df.columns)] is equivalent to
[(a, b) for a, b in zip(newlevel, df.columns)]
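An equivalent spelling, starting from the original single-level columns, is pd.MultiIndex.from_arrays (a minimal sketch using the same newlevel list):
# pair the new top level with the existing column names
df.columns = pd.MultiIndex.from_arrays([newlevel, df.columns])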
