How to generate letters instead of numbers in pandas multiindex? - python

I have a MultiIndex table in pandas and I need to generate some string values instead of numbers. I need the data to be random letters from a to z. How do I do it?
import pandas as pd
import numpy as np
index = pd.MultiIndex.from_product([[2020], [1, 2, 3, 4]],
                                   names=['year', 'q'])
columns = pd.MultiIndex.from_product([['Items1', 'Items2', 'Items3'], ['new', 'old']],
                                     names=['subject', 'type'])
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 20
data += 50
Ldata = pd.DataFrame(data, index=index, columns=columns)
Ldata

I figured it out.
import string
np.random.seed(123)
data = np.random.choice(list(string.ascii_lowercase), (4, 6))
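For completeness, a minimal sketch that plugs the random letters into the same MultiIndex frame from the question (same index and columns as above):
import string
import numpy as np
import pandas as pd
np.random.seed(123)
index = pd.MultiIndex.from_product([[2020], [1, 2, 3, 4]],
                                   names=['year', 'q'])
columns = pd.MultiIndex.from_product([['Items1', 'Items2', 'Items3'], ['new', 'old']],
                                     names=['subject', 'type'])
# random letters a-z instead of numbers, same 4x6 shape as the original data
letters = np.random.choice(list(string.ascii_lowercase), (4, 6))
Ldata = pd.DataFrame(letters, index=index, columns=columns)
print(Ldata)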

Related

getting the dtype of the elements in np array

I am working to clean and manipulate CSV data in NumPy, and I am getting this attribute error.
load_metrics("covid_sentiment_metrics.csv") gives me a numpy array as output, which is passed to the next function as a parameter.
def unstructured_to_structured(data, indexes):
    '''convert unstructured data to structured'''
    data_1 = np.delete(data, 0, 0)
    for i in range(np.shape(data_1)[1]):
        if i not in indexes:
            data_1[:, i] = (data_1[:, i]).astype(np.float64)
    data_1 = tuple([tuple(i) for i in data_1])
    return data_1
data = load_metrics("covid_sentiment_metrics.csv")
data = unstructured_to_structured(data, [0, 1, 7, 8])
print(data[5][0].dtype)
>>AttributeError: 'float' object has no attribute 'dtype'
The expected return should be float64 in this case.
I am not sure what I am doing wrong.
**This is an addition to the previous post**
import numpy as np
import pandas as pd
# this function returns a numpy array from a CSV file
def load_metrics(filename):
    '''read data as numpy array'''
    df = pd.read_csv('covid_sentiment_metrics.csv').astype(str)
    df.loc[-1] = df.columns.to_list()
    df = df.sort_index().reset_index(drop=True)
    df.drop(columns=['text', 'screen_name', 'followers', 'friends', 'user_ID', 'country_region'], inplace=True)
    df_numpy = df.to_numpy()
    return df_numpy

# the return from the above function is passed as a parameter
def unstructured_to_structured(data, indexes):
    data_1 = np.delete(data, 0, 0)
    # I am trying to change the data type for those columns that are not listed in indexes
    for i in range(np.shape(data_1)[1]):
        if i not in indexes:
            data_1[:, i] = (data_1[:, i]).astype(np.float64)
    data_1 = tuple([tuple(i) for i in data_1])
    return data_1
data = load_metrics("covid_sentiment_metrics.csv")
data = unstructured_to_structured(data, [0, 1, 7, 8])
print(data[5][0].dtype)
>>AttributeError: 'float' object has no attribute 'dtype'
I hope this is enough to make the situation clear.
link to csv: https://www.dropbox.com/s/q3oq5zq71dd6sjx/covid_sentiment_metrics.csv?dl=0
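A minimal sketch with made-up values (not the assignment data) shows where the error comes from: elements taken out of a plain string/object array and packed into tuples are ordinary Python objects without a .dtype, whereas a structured NumPy array with an explicit per-field dtype keeps that information.
import numpy as np
# A plain Python float has no .dtype attribute, while a NumPy scalar does.
x = 1.5
# x.dtype                          # AttributeError: 'float' object has no attribute 'dtype'
print(np.float64(1.5).dtype)       # float64
# Hypothetical rows: a structured array with a per-field dtype keeps dtype info on every element.
rows = [("AU", 0.13), ("US", 0.27)]
structured = np.array(rows, dtype=[("region", "U20"), ("score", "f8")])
print(structured[1][1].dtype)      # float64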

apply min-max scaler to multiple datasets using a loop

I have n DataFrames. I want to apply a min-max scaler to each of them.
from sklearn import preprocessing
x = df1.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
#df11= pd.DataFrame(x_scaled)
x_scaled1 = pd.DataFrame(x_scaled, columns = df1.columns)
That works for one DataFrame. I tried several ways to extend it; everything failed.
Please list out the methods you have tried for your problem.
However, you can store the DataFrame objects in a list and then scale each one as you loop through the list. I am doing it for two DataFrames, but the same applies for any number of them.
import numpy as np
import pandas as pd
from sklearn import preprocessing

df1 = {'col1': np.random.randint(1, 26, 10), 'col2': np.random.randint(1, 26, 10)}
df2 = {'col1': np.random.randint(1, 26, 10), 'col2': np.random.randint(1, 26, 10)}
df1 = pd.DataFrame(df1)
df2 = pd.DataFrame(df2)

df_list = [df1, df2]

def loop_scaler(df_list):
    scaled_df_list = []
    for df in df_list:
        df_array = df.values
        min_max_scaler = preprocessing.MinMaxScaler()
        df_scaled = min_max_scaler.fit_transform(df_array)
        df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
        scaled_df_list.append(df_scaled)
    return scaled_df_list
You can print and look at the scaled output:
scaled_df_list = loop_scaler(df_list)
for scaled_df in scaled_df_list:
    print(scaled_df.head(5))
or
scaled_df1, scaled_df2 = loop_scaler(df_list)
print(scaled_df1)
print(scaled_df2)
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
new_data = pd.DataFrame(data)
scaler = MinMaxScaler()
transformed_data = pd.DataFrame(scaler.fit_transform(new_data))
This may help to solve your problem.
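A compact variant of the same idea, as a sketch (the frames and column names here are made up): keep the DataFrames in a dict and scale each one in a comprehension, preserving its columns and index.
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# two hypothetical numeric frames standing in for the n DataFrames
dfs = {
    'df1': pd.DataFrame({'col1': np.random.randint(1, 26, 10),
                         'col2': np.random.randint(1, 26, 10)}),
    'df2': pd.DataFrame({'col1': np.random.randint(1, 26, 10),
                         'col2': np.random.randint(1, 26, 10)}),
}
# scale each frame, keeping its original columns and index
scaled = {name: pd.DataFrame(MinMaxScaler().fit_transform(df),
                             columns=df.columns, index=df.index)
          for name, df in dfs.items()}
print(scaled['df1'].head())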

How can I make a dataset of the elements of matrices in a dataframe?

I have a dataset of 3 parameters 'A', 'B', 'C' in a .TXT file. After I print them as 24x20 matrices, I need to collect the 1st elements of 'A', 'B', 'C' into long arrays in a pandas DataFrame, then the 2nd elements of each, then the 3rd, and so on up to the 480th elements.
My data in the text file looks like this:
id_set: 000
A: -2.46882615679
B: -2.26408246559
C: -325.004619528
I have already made a pandas DataFrame with the three columns 'A', 'B', 'C' and an index, and defined functions to print the 24x20 matrices the right way. A simple example via 2x2 matrices:
1st cycle: A = [1,2,    B = [4,5,    C = [8,9,
                3,4]          6,7]         10,11]
2nd cycle: A = [0,8,    B = [1,9,    C = [10,1,
                2,5]          4,8]          2,7]
Reshape to this form:
A(1,1),B(1,1),C(1,1),A(1,2),B(1,2),C(1,2),.....
Result= [1,4,8,2,5,9,3,6,10,4,7,11] #1st cycle
[0,1,10,8,9,1,2,4,2,5,8,7] #2nd cycle
My script is as follows:
import numpy as np
import pandas as pd
import os

def normalize(value, min_value, max_value, min_norm, max_norm):
    new_value = ((max_norm - min_norm) * ((value - min_value) / (max_value - min_value))) + min_norm
    return new_value

dft = pd.read_csv('D:\mc25.TXT', header=None)
id_set = dft[dft.index % 4 == 0].astype('int').values
A = dft[dft.index % 4 == 1].values
B = dft[dft.index % 4 == 2].values
C = dft[dft.index % 4 == 3].values
data = {'A': A[:, 0], 'B': B[:, 0], 'C': C[:, 0]}
df = pd.DataFrame(data, columns=['A', 'B', 'C'], index=id_set[:, 0])

# next iteration create all plots, change the number of cycles
cycles = int(len(df) / 480)
print(cycles)
for cycle in range(0, 10):
    count = '{:04}'.format(cycle)
    j = cycle * 480
    for i in df:
        try:
            os.mkdir(i)
        except:
            pass
        min_val = df[i].min()
        min_nor = -1
        max_val = df[i].max()
        max_nor = 1
        ordered_data = mkdf(df.iloc[j:j+480][i])
        csv = print_df(ordered_data)
        # Print .csv files containing the matrix of each parameter, named by cycle
        csv.to_csv(f'{i}/{i}{count}.csv', header=None, index=None)
        if 'C' in i:
            min_nor = -40
            max_nor = 150
            # Applying normalization for C between [-40, +150]
            new_value3 = normalize(df['C'].iloc[j:j+480], min_val, max_val, -40, 150)
            df3 = print_df(mkdf(new_value3))
            df3.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
        else:
            # Applying normalization for A, B between [-1, +1]
            new_value1 = normalize(df['A'].iloc[j:j+480], min_val, max_val, -1, 1)
            new_value2 = normalize(df['B'].iloc[j:j+480], min_val, max_val, -1, 1)
            df1 = print_df(mkdf(new_value1))
            df2 = print_df(mkdf(new_value2))
            df1.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
            df2.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
Note 2: I provided a dataset in a text file for 3 cycles:
Text dataset
I am not sure if I understood your question fully, but this is a solution:
Convert your DataFrame to a 2D NumPy array using to_numpy() (or as_matrix() in older pandas), then use ravel() to get a vector of size 480 * 3. Then loop over your cycles and use vstack to stack the rows of your result. This is the code with your example data:
import numpy as np
import pandas as pd

A = [[1, 2, 3, 4], [10, 20, 30, 40]]
B = [[4, 5, 6, 7], [40, 50, 60, 70]]
C = [[8, 9, 10, 11], [80, 90, 100, 110]]
cycles = 2
for cycle in range(cycles):
    data = {'A': A[cycle], 'B': B[cycle], 'C': C[cycle]}
    df = pd.DataFrame(data)
    D = df.to_numpy().ravel()
    if cycle == 0:
        Results = np.array(D)
    else:
        Results = np.vstack((Results, D))
# Output: Results = array([[  1,   4,   8,   2,   5,   9,   3,   6,  10,   4,   7,  11],
#                          [ 10,  40,  80,  20,  50,  90,  30,  60, 100,  40,  70, 110]], dtype=int64)
np.savetxt("Results.csv", Results, delimiter=",")
Is this what you wanted?
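For the real DataFrame, where each parameter is one long column with 480 rows per cycle, the same interleaving can also be done without an explicit loop by reshaping. A sketch on a miniature version (4 rows per cycle and made-up values, so the shapes stay readable):
import numpy as np
import pandas as pd
rows_per_cycle = 4   # 480 for the real data
df = pd.DataFrame({'A': [1, 2, 3, 4, 10, 20, 30, 40],
                   'B': [4, 5, 6, 7, 40, 50, 60, 70],
                   'C': [8, 9, 10, 11, 80, 90, 100, 110]})
# reshaping the (cycles * rows, 3) array to (cycles, rows * 3) interleaves
# each cycle's values row-wise as A, B, C, A, B, C, ...
values = df[['A', 'B', 'C']].to_numpy()
n_cycles = len(df) // rows_per_cycle
results = values.reshape(n_cycles, rows_per_cycle * 3)
print(results)
# [[  1   4   8   2   5   9   3   6  10   4   7  11]
#  [ 10  40  80  20  50  90  30  60 100  40  70 110]]
np.savetxt("Results.csv", results, delimiter=",")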

Putting a gap/break in a pyplot line plot without losing data

I have a time series with several large data gaps. I would like to see a connecting line between data points that are less than an hour apart, but not if the gap is larger. The accepted answer to the question, Put a gap/break in a line plot, would work except that you sacrifice the masked points. I would like to avoid that.
I have attempted to make a list comprehension that would insert NaNs into the array, which I think would automatically achieve the same result, but I don't seem to be able to do it correctly. The best I have found is as follows:
import datetime as dtm
import numpy as np
x = np.array([dtm.datetime(2001,4,3,0,47,30), dtm.datetime(2001,4,3,0,52,30), dtm.datetime(2001,4,3,0,57,30),
              dtm.datetime(2001,4,3,3,57,30), dtm.datetime(2001,4,3,4,2,30), dtm.datetime(2001,4,3,4,7,30)])
xmod = np.array([x[0]] + [dt1 if dt1-dt0 < dtm.timedelta(hours=1.) else [dt1, np.nan]
                          for dt1, dt0 in zip(x[1:], x[:-1])])
This gives the result:
In [7]: xmod
Out[7]:
array([datetime.datetime(2001, 4, 3, 0, 47, 30),
datetime.datetime(2001, 4, 3, 0, 47, 30),
datetime.datetime(2001, 4, 3, 0, 52, 30),
[datetime.datetime(2001, 4, 3, 0, 57, 30), nan],
datetime.datetime(2001, 4, 3, 3, 57, 30),
datetime.datetime(2001, 4, 3, 4, 2, 30)], dtype=object)
I have not been able to find a way to insert both the data point and the np.nan without putting brackets around them. Is this possible? Is there a better way to achieve my goal? Thanks!
In accordance with the comment above, probably the easiest way to do this would be to separate the data into groups where you need the gaps. Here is one way to implement such a thing.
import datetime as dtm
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

x = np.array([dtm.datetime(2001,4,3,0,47,30), dtm.datetime(2001,4,3,0,52,30), dtm.datetime(2001,4,3,0,57,30),
              dtm.datetime(2001,4,3,3,57,30), dtm.datetime(2001,4,3,4,2,30), dtm.datetime(2001,4,3,4,7,30)])
y = range(len(x))

# make a dataframe with groups separated that are over an hour apart
data = []
g = 0
for i in range(len(x)):
    x0 = x[i]
    y0 = y[i]
    if i < (len(x) - 1):
        x1 = x[i+1]
        td = x1 - x0
        elapsed_seconds = td.total_seconds()
        hrs = (elapsed_seconds/60)/60
        if hrs < 1:
            data.append([x0, y0, g])
        else:
            data.append([x0, y0, g])
            g += 1
    else:
        data.append([x0, y0, g])
df = pd.DataFrame(data, columns=['x', 'y', 'group'])

# draw a plot
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
for i, dfg in df.groupby('group'):
    ax.plot(dfg['x'], dfg['y'], c='b')
So, I accepted the answer by djakubosky because it seems clean and is probably the right approach. However, by the time that answer was posted, I had decided that what I was doing was inappropriate for a list comprehension and simply wrote it as a for loop - and that worked fine. Possibly this will be useful to someone else. Here is the code:
def insert_breaks(x, y):
    import datetime as dtm
    import numpy as np
    xnew = []
    ynew = []
    for dt1, dt0, y1, y0 in zip(x[1:], x[:-1], y[1:], y[:-1]):
        if dt1 - dt0 < dtm.timedelta(hours=1):
            xnew += [dt0]
            ynew += [y0]
        else:
            xnew += [dt0, dt0 + (dt1 - dt0) / 2]
            ynew += [y0, np.nan]
    xnew += [dt1]
    ynew += [y1]
    return xnew, ynew
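For what it's worth, the flattening the original question asked about is also possible in a single list comprehension: a double for unpacks the per-pair lists, so no brackets remain. A minimal sketch on a shortened version of the timestamps above:
import datetime as dtm
import numpy as np
# shortened timestamps with one gap of more than an hour
x = [dtm.datetime(2001, 4, 3, 0, 47, 30), dtm.datetime(2001, 4, 3, 0, 52, 30),
     dtm.datetime(2001, 4, 3, 0, 57, 30), dtm.datetime(2001, 4, 3, 3, 57, 30)]
# the inner list is either [dt1] or [np.nan, dt1]; the second `for` flattens it
xmod = [x[0]] + [item
                 for dt1, dt0 in zip(x[1:], x[:-1])
                 for item in ([dt1] if dt1 - dt0 < dtm.timedelta(hours=1)
                              else [np.nan, dt1])]
print(xmod)   # 0:47:30, 0:52:30, 0:57:30, nan, 3:57:30 -- every point kept, NaN at the gap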

Change the Value of Specific Columns Based on a Condition in a DataFrame Python

I am not sure how to automate this for a specific set of columns.
It should go into a specific column and check the values: if a value is equal to or larger than 10, replace it with 100; if it is less than that, make it 0. But it should do that for all the columns rather than specifying the name of the column each time.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000,
                           n_features=6,
                           n_informative=3,
                           n_classes=2,
                           random_state=0,
                           scale=10,
                           shuffle=False)

# Creating a DataFrame
df = pd.DataFrame({'car': X[:, 0],
                   'ball': X[:, 1],
                   'Feature 3': 5,
                   'Feature 4': X[:, 3],
                   'Feature 5': X[:, 4],
                   'Feature 6': X[:, 5],
                   'Class': y})
df.loc[df['Feature 6'] > 10, 'Feature 6'] = 100
and the set of columns will be extracted using the following line
spike_cols = [col for col in df.columns if "tu" in str(col)]
So basically, change the values of the columns containing tu anywhere in the column name.
I think you need numpy.where with a DataFrame constructor, because it returns a 2d numpy array:
df = pd.DataFrame(np.where(df >= 10, 100, 0), columns=df.columns, index=df.index)
Thank you @Wen for the solution with DataFrame.ge and multiplying the boolean mask by 100; True values are treated as 1 and False as 0:
df = df.ge(10).mul(100)
EDIT: If want applied solution only for some columns:
df[spike_cols] = pd.DataFrame(np.where(df[spike_cols] >= 10, 100, 0),
                              columns=spike_cols, index=df.index)
df[spike_cols] = df[spike_cols].ge(10).mul(100)
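A small end-to-end sketch tying the two pieces together (the column names below are made up; 'actual' and 'virtual' both contain "tu"):
import numpy as np
import pandas as pd
df = pd.DataFrame({'actual': [3, 12, 9], 'virtual': [15, 2, 30], 'other': [1, 20, 5]})
spike_cols = [col for col in df.columns if "tu" in str(col)]   # ['actual', 'virtual']
df[spike_cols] = df[spike_cols].ge(10).mul(100)
print(df)
#    actual  virtual  other
# 0       0      100      1
# 1     100        0     20
# 2       0      100      5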
