Getting feature names after one-hot encoding - python

I have a dataset that I recently transformed with one-hot encoding and then trained a lasso logistic regression on. I'm trying to get a list of the non-zero coefficients. I can get the coefficients through sklearn, but I'm not sure how to map them back to the feature names after one-hot encoding.
A small excerpt of the dataset (pre one-hot encoding) is below:
{'acc_now_delinq': {29601: 0.0,
143234: 0.0,
157345: 0.0,
158754: 0.0,
229042: 0.0},
'application_type': {29601: 0, 143234: 0, 157345: 0, 158754: 0, 229042: 0},
'collections_12_mths_ex_med': {29601: 0.0,
143234: 0.0,
157345: 0.0,
158754: 0.0,
229042: 0.0},
'credit_age': {29601: 118.0,
143234: 157.0,
157345: 213.0,
158754: 269.0,
229042: 240.0},
'delinq_2yrs': {29601: 0.0,
143234: 0.0,
157345: 0.0,
158754: 0.0,
229042: 0.0},
'dti': {29601: 2.0600000000000001,
143234: 23.710000000000001,
157345: 18.960000000000001,
158754: 18.690000000000001,
229042: 22.530000000000001},
'emp_length_num': {29601: 8.0,
143234: 2.0,
157345: 1.0,
158754: 7.0,
229042: 1.0},
'home_ownership': {29601: 4, 143234: 5, 157345: 5, 158754: 1, 229042: 1},
'inq_last_6mths': {29601: 2.0,
143234: 0.0,
157345: 0.0,
158754: 0.0,
229042: 0.0},
'loan_amnt': {29601: 214.0,
143234: 211.0,
157345: 571.0,
158754: 937.0,
229042: 466.0},
'loan_status': {29601: 0, 143234: 1, 157345: 0, 158754: 1, 229042: 1},
'log_annual_inc': {29601: 11.225243392499999,
143234: 10.8022251252,
157345: 11.0020998412,
158754: 11.6952470218,
229042: 11.225243392499999},
'open_acc': {29601: 5.0,
143234: 21.0,
157345: 11.0,
158754: 9.0,
229042: 14.0},
'pub_rec': {29601: 0.0, 143234: 0.0, 157345: 0.0, 158754: 0.0, 229042: 0.0},
'purpose': {29601: 4, 143234: 2, 157345: 2, 158754: 2, 229042: 2},
'revol_bal': {29601: 2266.0,
143234: 12254.0,
157345: 20657.0,
158754: 11367.0,
229042: 39404.0},
'revol_inc_ratio': {29601: 0.030213333333299997,
143234: 0.24941990637100001,
157345: 0.34428333333300004,
158754: 0.094725000000000004,
229042: 0.52538666666699996},
'revol_util': {29601: 44.0,
143234: 89.400000000000006,
157345: 76.900000000000006,
158754: 81.200000000000003,
229042: 95.5},
'tot_coll_amt': {29601: 0.0,
143234: 0.0,
157345: 0.0,
158754: 0.0,
229042: 0.0},
'tot_cur_bal': {29601: 2266.0,
143234: 115947.0,
157345: 80598.0,
158754: 347695.0,
229042: 355741.40000000002},
'total_acc': {29601: 5.0,
143234: 41.0,
157345: 35.0,
158754: 17.0,
229042: 30.0},
'total_rev_hi_lim': {29601: 5100.0,
143234: 13700.0,
157345: 26900.0,
158754: 14000.0,
229042: 80780.0},
'verification_status': {29601: 0, 143234: 2, 157345: 1, 158754: 2, 229042: 1}}
And my one-hot encoding code:
def one_hot(df):
    # Boolean mask of categorical columns (everything that is not float64)
    categorical = (df.dtypes.values != np.dtype('float64'))
    print(categorical)
    # Get numpy arrays from the data
    x = df.values[:, :-1]
    y = df.values[:, -1]
    # Apply one-hot encoding; the last value in the mask is y, so drop it
    encoder = preprocessing.OneHotEncoder(categorical_features=categorical[:-1], sparse=False)
    x = encoder.fit_transform(x)
    return x, y

Assuming you have your small excerpt of the dataset stored in a variable called temp:
import numpy as np
import pandas as pd

temp = pd.DataFrame(temp)
categorical = (temp.dtypes.values != np.dtype('float64'))
categorical = temp.columns[categorical]

def one_hot(temp, categorical):
    # temp is the data frame whose "categorical" columns need to be one-hot encoded
    from sklearn.preprocessing import OneHotEncoder
    enc_model = OneHotEncoder(sparse=False)
    X = enc_model.fit_transform(temp[categorical])
    # Build "column_value" names for the encoded columns
    uniq_vals = temp[categorical].apply(lambda x: x.value_counts()).unstack()
    uniq_vals = uniq_vals[~uniq_vals.isnull()]
    enc_cols = list(uniq_vals.index.map('{0[0]}_{0[1]}'.format))  # https://stackoverflow.com/questions/41987743/merge-two-multiindex-levels-into-one-in-pandas
    enc_df = pd.DataFrame(X, columns=enc_cols, index=temp.index, dtype='bool')
    return enc_df
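To go from the fitted lasso model straight to named non-zero coefficients, here is a minimal sketch of one way to do it, assuming a recent scikit-learn (get_feature_names_out exists from 1.0 onward; older versions call it get_feature_names, and sparse_output was named sparse before 1.2). The categorical/numeric column split is illustrative, based on the excerpt above:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['application_type', 'home_ownership', 'purpose', 'verification_status']
numeric_cols = [c for c in temp.columns if c not in categorical_cols + ['loan_status']]

enc = OneHotEncoder(sparse_output=False)  # sparse=False on older scikit-learn
X_cat = enc.fit_transform(temp[categorical_cols])
X = np.hstack([X_cat, temp[numeric_cols].to_numpy()])
y = temp['loan_status'].to_numpy()

# Feature names in the same column order as X
feature_names = list(enc.get_feature_names_out(categorical_cols)) + numeric_cols

clf = LogisticRegression(penalty='l1', solver='liblinear').fit(X, y)

# Pair each coefficient with its name and keep the non-zero ones
nonzero = [(name, coef) for name, coef in zip(feature_names, clf.coef_[0]) if coef != 0]
print(nonzero)
Because the encoded columns are stacked in front of the numeric ones, feature_names lines up index-for-index with clf.coef_[0].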

Related

Convert *.csv file in DB with repeat second header several times

I have a *.csv file that looks like this:
# time;P_O2, atm;P_He, atm;Flow O2, l/min;Flow He, l/min;FiO2 Env, %;FiO2 sens2, %;P mask, cm H2O;Tmask, gradC;Tnagr, gradC;V, ml;f, 1/min;Tzad, gradC;FiO2 zad, %;Flags;
# POWERON 01.11.2018 15:02:29
1;39;33;0;2;0;0;255;135;135;0;0;0;25;83886592;0
5;39;33;0;2;0;0;255;135;135;0;0;0;25;83886624;0
26;0;0;0;8;529;0;255;135;135;0;0;0;25;83886592;0
72;0;0;0;8;598;0;248;135;135;0;0;0;25;83886085;0
# POWERON 01.11.2018 15:04:02
1;0;0;0;7;0;0;255;135;135;0;0;0;25;83886592;0
2;0;0;0;7;113;0;255;135;135;0;0;0;25;83886085;0
# POWERON 01.11.2018 15:04:48
1;0;0;0;6;0;0;255;135;135;0;0;0;25;83886592;0
2;0;0;0;6;115;0;255;135;135;0;0;0;25;83886085;0
So, I tried to convert it to a DB like this:
import sqlite3
import pandas as pd

conn = sqlite3.connect('mydb.db')
stud_data = pd.read_csv(r'Log/20181101.LOG', sep=';', engine='python')
stud_data.to_sql('interation', conn, if_exists='replace', index=False)
cur = conn.cursor()
for row in cur.execute('SELECT * FROM interation'):
    print(row)
conn.close()
but my result is:
('# POWERON 01.11.2018 15:02:29', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
('1', 39.0, 33.0, 0.0, 2.0, 0.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886592.0, 0.0)
('5', 39.0, 33.0, 0.0, 2.0, 0.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886624.0, 0.0)
('26', 0.0, 0.0, 0.0, 8.0, 529.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886592.0, 0.0)
('72', 0.0, 0.0, 0.0, 8.0, 598.0, 0.0, 248.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886085.0, 0.0)
('# POWERON 01.11.2018 15:04:02', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
('1', 0.0, 0.0, 0.0, 7.0, 0.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886592.0, 0.0)
('2', 0.0, 0.0, 0.0, 7.0, 113.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886085.0, 0.0)
('# POWERON 01.11.2018 15:04:48', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
('1', 0.0, 0.0, 0.0, 6.0, 0.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886592.0, 0.0)
I need the value (the date) from each 'POWERON' comment inserted as an extra column on every row that follows it, like this:
POWERON; time; P_O2, atm; P_He, atm; ....
01.11.2018 15:02:29 1; 39; 33; ...
01.11.2018 15:02:29 5; 39; 33; ...
How can I do this simply? Maybe with pd.read_csv (some option?), or do I have to iterate over the DB?
It's my first question, sorry for any mistakes or design errors.
Using pd.read_csv alone is not sufficient to extract the datetimes from the comments; you have to parse the file yourself. Furthermore, the numbers of columns in the header and the data rows are unbalanced.
import sqlite3
import io
import pandas as pd

def load_csvfile(filepath):
    with open(filepath) as csvfile:
        data = io.StringIO()
        # Prepend a POWERON column to the header; rsplit drops the trailing empty field
        line = csvfile.readline()
        line = f"POWERON;{line.rsplit(';', 1)[0]}\n"
        data.writelines([line])
        for line in csvfile.readlines():
            if line.startswith('# POWERON'):
                # Remember the datetime from the comment line
                dt = line[10:].strip()
            else:
                # Prefix the data row with the last seen datetime
                line = f"{dt};{line.rsplit(';', 1)[0]}\n"
                data.writelines([line])
        data.seek(0)
        return pd.read_csv(data, sep=';', parse_dates=['POWERON'])

conn = sqlite3.connect('mydb.db')
stud_data = load_csvfile(r'Log/20181101.LOG')
stud_data.to_sql('interation', conn, if_exists='replace', index=False)
cur = conn.cursor()
for row in cur.execute('SELECT * FROM interation'):
    print(row)
conn.close()
('2018-01-11 15:02:29', 1, 39, 33, 0, 2, 0, 0, 255, 135, 135, 0, 0, 0, 25, 83886592)
('2018-01-11 15:02:29', 5, 39, 33, 0, 2, 0, 0, 255, 135, 135, 0, 0, 0, 25, 83886624)
('2018-01-11 15:02:29', 26, 0, 0, 0, 8, 529, 0, 255, 135, 135, 0, 0, 0, 25, 83886592)
('2018-01-11 15:02:29', 72, 0, 0, 0, 8, 598, 0, 248, 135, 135, 0, 0, 0, 25, 83886085)
('2018-01-11 15:04:02', 1, 0, 0, 0, 7, 0, 0, 255, 135, 135, 0, 0, 0, 25, 83886592)
('2018-01-11 15:04:02', 2, 0, 0, 0, 7, 113, 0, 255, 135, 135, 0, 0, 0, 25, 83886085)
('2018-01-11 15:04:48', 1, 0, 0, 0, 6, 0, 0, 255, 135, 135, 0, 0, 0, 25, 83886592)
('2018-01-11 15:04:48', 2, 0, 0, 0, 6, 115, 0, 255, 135, 135, 0, 0, 0, 25, 83886085)
Note: you will receive a UserWarning about the spaces in the column names.
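If you want to avoid that warning (my own suggestion, not part of the original answer), one option is to normalize the column names before writing:
# Replace the spaces (and commas) in the header names before to_sql
stud_data.columns = [c.strip().replace(', ', '_').replace(' ', '_') for c in stud_data.columns]
stud_data.to_sql('interation', conn, if_exists='replace', index=False)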

VaderSentiment: emoji analyzer does not work in Jupyter Notebook

I am trying to do some sentiment analysis on r/wallstreetbets content and would also like to use the meaning of emojis.
Here is my code:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
wsb_lingo = {
"bullish": 4.0,
"bearish": -4.0,
"bagholder": -4.0,
"BTFD": 4.0,
"FD": 4.0,
"diamond hands": 0.0,
"paper hands": 0.0,
"DD": 4.0,
"GUH": -4.0,
"pump": 4.0,
"dump": -4.0,
"gem stone": 4.0, # emoji
"rocket": 4.0, # emoji
"andromeda": 0.0,
"to the moon": 4.0,
"stonks": -4.0,
"tendies": 4.0,
"buy": 4.0,
"sell": -4.0,
"hold": 4.0,
"short": 4.0,
"long": 4.0,
"overvalued": -4.0,
"undervalued": 4.0,
"calls": 4.0,
"call": 4.0,
"puts": -4.0,
"put": -4.0,
}
sid = SentimentIntensityAnalyzer()
sid.lexicon.update(wsb_lingo)
# Test
print(sid.polarity_scores('🚀'))
print(sid.polarity_scores('😄'))
The output is given below:
{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
How is it possible that it gives no sentiment at all for emojis (could it be due to Jupyter Notebook)? Am I forgetting something here? All libraries are up to date.
If I use vaderSentiment instead of nltk.sentiment.vader, it works for me:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
new = { "rocket": 4.0 }
sia = SentimentIntensityAnalyzer()
sia.polarity_scores('🚀')
# Outputs: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
sia.lexicon.update(new)
sia.polarity_scores('🚀')
# Outputs: {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.7184}
See also this issue
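The likely explanation is that the standalone vaderSentiment package ships an emoji lexicon and translates emojis such as 🚀 into their text descriptions ("rocket") before scoring, which is exactly why adding "rocket" to the lexicon changes the score above; as far as I can tell, the NLTK port does not do this translation, so emojis fall through as unknown tokens. Jupyter Notebook itself is not the problem.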

How would I be able to find the average from a list of lists given two constraints?

So I have a list of lists, where the 7th index of each sublist contains the value I am interested in averaging; however, the numbers must be averaged according to their type. The type to match on can be found at the 11th index of each sublist.
Below is some code I wrote.
# Open the csv file
opened_file = open('AppleStore.csv')
from csv import reader
read_file = reader(opened_file)
# Store the data as a list of lists
apps_data = list(read_file)

# idx_num = index number of interest
# list_doc = the list of lists
# row_start = 1
def extract(idx_num, list_doc, row_start=1):
    a_list = []
    for row in list_doc[row_start:]:
        var = row[idx_num]
        a_list.append(var)
    return a_list

# Use the extract function to get an array
a_list = extract(11, apps_data, 0)
# Find unique elements
a_list_set = set(a_list)
# Create a dictionary with initial values at [0,0]
dic = dict.fromkeys(a_list_set, [0, 0])
print(dic)
# Works as intended:
# {'Weather': [0, 0], 'Sports': [0, 0], 'Productivity': [0, 0], 'Games': [0, 0],
#  'News': [0, 0], 'Finance': [0, 0], 'Education': [0, 0], 'Entertainment': [0, 0],
#  'Health & Fitness': [0, 0], 'Business': [0, 0], 'Social Networking': [0, 0],
#  'prime_genre': [0, 0], 'Photo & Video': [0, 0], 'Navigation': [0, 0],
#  'Music': [0, 0], 'Medical': [0, 0], 'Travel': [0, 0], 'Reference': [0, 0],
#  'Shopping': [0, 0], 'Utilities': [0, 0], 'Food & Drink': [0, 0],
#  'Lifestyle': [0, 0], 'Catalogs': [0, 0], 'Book': [0, 0]}

for row in apps_data[1:]:
    price = float(row[4])
    genre = row[11]
    # Here is the issue:
    # I thought this would match the genre instance to the appropriate key
    # so I could then update its values.
    if genre in dic.keys():
        dic[genre][0] += 1
        dic[genre][1] += price
    else:
        dic[genre][0] = 1
        dic[genre][1] = price
print(dic)

# From here I would extract the array contents of the dictionary
for genre in a_list_set:
    print(str(genre) + " mean price:" + str(round(dic[genre][1]/dic[genre][0], 2)))
I got this instead.
{'Weather': [7197, 12423.58999999945], 'Sports': [7197, 12423.58999999945], 'Productivity': [7197, 12423.58999999945], 'Games': [7197, 12423.58999999945], 'News': [7197, 12423.58999999945], 'Finance': [7197, 12423.58999999945], 'Education': [7197, 12423.58999999945], 'Entertainment': [7197, 12423.58999999945], 'Health & Fitness': [7197, 12423.58999999945], 'Business': [7197, 12423.58999999945], 'Social Networking': [7197, 12423.58999999945], 'prime_genre': [7197, 12423.58999999945], 'Photo & Video': [7197, 12423.58999999945], 'Navigation': [7197, 12423.58999999945], 'Music': [7197, 12423.58999999945], 'Medical': [7197, 12423.58999999945], 'Travel': [7197, 12423.58999999945], 'Reference': [7197, 12423.58999999945], 'Shopping': [7197, 12423.58999999945], 'Utilities': [7197, 12423.58999999945], 'Food & Drink': [7197, 12423.58999999945], 'Lifestyle': [7197, 12423.58999999945], 'Catalogs': [7197, 12423.58999999945],'Book': [7197, 12423.58999999945]}
We can do this with itertools.groupby. First, we extract the two "columns" of concern (the 7th and 11th value of each row) into subset, sorted by the 11th value.
Then we use groupby to partition subset into groups whose members all share the same 2nd element (the original 11th element), and a dict comprehension to take the mean of the 1st elements in each group.
from itertools import groupby
from operator import itemgetter
from statistics import mean

# Pull out (value, type) pairs, sorted by type, since groupby needs sorted input
subset = sorted(((row[6], row[10]) for row in data), key=itemgetter(1))
# Average the values within each type
result = {key: mean(map(itemgetter(0), group)) for key, group in groupby(subset, itemgetter(1))}
print(result)
Some sample data:
[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.926456602181107, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.261928508086729, 0.0, 0.0, 0.0, 'that'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6.582427615396794, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.08345371286375847, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6323414510835206, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -7.755177634382969, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -5.948058847184649, 0.0, 0.0, 0.0, 'that'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -5.767820549798114, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.609131600539092, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.2106567350536854, 0.0, 0.0, 0.0, 'that'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -3.1550716372338297, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.6037278107842077, 0.0, 0.0, 0.0, 'that'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -11.819322083983815, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.441817745217389, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4961079817344718, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.269603775378254, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.42023137240633596, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.855652365179269, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -8.048026683773955, 0.0, 0.0, 0.0, 'that'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.577046681982131, 0.0, 0.0, 0.0, 'this']]
And the result:
{'other': 0.585667907075492,
'that': -3.530217022955171,
'this': -0.9035005758618025}
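As an aside, the identical totals for every genre in the original attempt come from dict.fromkeys(a_list_set, [0, 0]): every key is bound to the same list object, so each update shows up under every key. A tiny sketch of the difference:
# dict.fromkeys shares one mutable default across all keys
dic = dict.fromkeys(['a', 'b'], [0, 0])
dic['a'][0] += 1
print(dic)  # {'a': [1, 0], 'b': [1, 0]} - both keys see the change

# A dict comprehension creates a fresh list per key
dic = {k: [0, 0] for k in ['a', 'b']}
dic['a'][0] += 1
print(dic)  # {'a': [1, 0], 'b': [0, 0]}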

Python: ValueError: setting an array element with a sequence

I'm trying to use scikit-learn to do some ML.
I am using the preprocessing module to prep my data. The data are of type float.
From reading other questions regarding this error (ValueError: setting an array element with a sequence), it's either due to the wrong structure of my data or because my data is of type string. Neither seems to be the case here.
Please let me know if you have any idea how to solve this issue or what it even means. Thank you.
The code:
print(X)
pred_X = np.array(pred_X)
pred_Y = np.array(pred_Y)
X = np.array(X)
Y = np.array(Y)
X = preprocessing.scale(X)
pred_X = preprocessing.scale(pred_X)
The output of print(X):
[[547180.0, 120.0, 113.0, 456701.0, 1.0, 6.43, -1.0, 0.313, 0.42, 0.267 3.0, 11800.0, 607208.0, 120.0, 113.0, 456701.0, 1.0, 0.273, 0.331, 0.154, 6.0, 10300.0, 458015.0, 113.0, 120.0, 45328 6.0, 1.0, 2.54, -1.0, 0.32, 0.443, 0.257, 3.0, 92000.0, 543685.0, 120.0, 113.0, 456701.0, 1.0, 6.43, 1.0, 0.296, 0.4, 0.234, 2.0, 8800.0, 594809.0, 475582.0, 120.0, 113.0, 456701.0, 1.0, 1.0, 0.295, 0.384, 0.264, 4.0, 7700.0],
[547180.0, 120.0, 113.0, 456701.0, 1.0, 6.43, -1.0, 0.313, 0.42, 0.267, 3.0, 11800.0, 607208.0, 120.0, 113.0, 456701.0, 1.0, 0.273, 0.331, 0.154, 6.0, 10300.0, 458015.0, 113.0, 120.0, 453286.0, 1.0, 2.54, -1.0, 0.32, 0.443, 0.257, 3.0, 92000.0, 543685.0, 120.0, 113.0, 456701.0, 1.0, 6.43, 1.0, 0.296, 0.4, 0.234, 2.0, 8800.0, 594809.0, 435062.0, 120.0, 113.0, 456701.0, 1.0, 1.0, 0.312, 0.364, 0.154, 5.0, 6900.0],
[547180.0, 120.0, 113.0, 456701.0, 1.0, 6.43, -1.0, 0.313, 0.42, 0.267, 3.0, 11800.0, 607208.0, 120.0, 113.0, 456701.0, 1.0, 0.273, 0.331, 0.154, 6.0, 10300.0, 458015.0, 113.0, 120.0, 453286.0, 1.0, 2.54, -1.0, 0.32, 0.443, 0.257, 3.0, 92000.0, 543685.0, 120.0, 113.0, 456701.0, 1.0, 6.43, 1.0, 0.296, 0.4, 0.234, 2.0, 8800.0, 594809.0, 446308.0, 120.0, 113.0, 456701.0, 1.0, 0.0, 0.221, 0.28e, 0.115, 8.0, 6400.0]]
The Error:
Traceback (most recent call last):
File "sampleSVM.py", line 46, in <module>
X = preprocessing.scale(X)
File "/home/user/.local/lib/python3.5/site-packages/sklearn/preprocessing/data.py", line 133, in scale
dtype=FLOAT_DTYPES)
File "/home/user/.local/lib/python3.5/site-packages/sklearn/utils/validation.py", line 433, in check_array
array = np.array(array, dtype=dtype, order=order, copy=copy)
ValueError: setting an array element with a sequence.
Your input array X is malformed: there are 59 elements in row 1 and 58 in rows 2 and 3, so when you convert it to a numpy array it becomes an array of shape (3,) with dtype=object.
The solution is to check and fix your input data; every row in X must be the same length.
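A quick way to confirm and locate the malformed rows, assuming X is still the plain nested list from the question:
from collections import Counter

lengths = Counter(len(row) for row in X)
print(lengths)  # e.g. Counter({58: 2, 59: 1}) - row lengths are not uniform

# Indices of rows whose length differs from the most common one
expected = lengths.most_common(1)[0][0]
print([i for i, row in enumerate(X) if len(row) != expected])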

How to create a list from existing list in python

I have a list in the format below. How can I create another list from the existing one with just selected elements?
[{'UserDiscount': 0.0, 'CostTotalInvEOPAmount': 940.0, 'WeekEndingData': u'2016-10-08', 'WeeksOnHand': 0.0, 'UnitTotalInvEOPQuantity': 250.0, 'WeeksOfSales': 0.0, 'UnitCostAmount': 3.76, 'Week': u'2016 Wk 36', 'CostReceiptAmount': 940.0, 'UnitSalesQuantity': 0.0, 'UnitReceiptQuantity': 250.0, 'Demand': 0.0, 'InventoryBOP': 0.0, 'PEMDiscount': 0.0, 'ElasticLift': 0.0, 'StoreCount': 0, 'PriceStatus': 4, 'UnitOnOrderQuantity': None, 'ReceiptSizeContributions': [{u'sizeId': u'e1656ac7-1cc1-40ce-b485-989bba9d758d', u'contribution': 1.0}], 'CostSalesAmount': 0.0, 'LifeCycleProperties': {u'IsAtRegularPrice': False, u'IsAtMarkdown': False, u'IsFinished': False, u'IsPreSeason': True}, 'MardownDiscount': 0.0, 'RecommendedReceipt': 250.0, 'RecommendedReceiptSizeContributions': [{u'sizeId': u'e1656ac7-1cc1-40ce-b485-989bba9d758d', u'contribution': 1.0}], 'UnitTotalInvBOPQuantity': 0.0, 'CostOnOrderAmount': None, 'InventoryEOP': 250.0, 'CostTotalInvBOPAmount': 0.0, 'Receipt': 250.0, 'Sales': 0.0, 'LostSales': 0.0, 'TotalDiscount': 0.0, 'RetailSalesAmount': 0.0},
{'UserDiscount': 0.0, 'CostTotalInvEOPAmount': 940.0, 'WeekEndingData': u'2016-10-15', 'WeeksOnHand': 0.0, 'UnitTotalInvEOPQuantity': 250.0, 'WeeksOfSales': 15.784951285314385, 'UnitCostAmount': 3.76, 'Week': u'2016 Wk 37', 'CostReceiptAmount': 0.0, 'UnitSalesQuantity': 0.0, 'UnitReceiptQuantity': 0.0, 'Demand': 0.0, 'InventoryBOP': 250.0, 'PEMDiscount': 0.0, 'ElasticLift': 0.0, 'StoreCount': 0, 'PriceStatus': 4, 'UnitOnOrderQuantity': None, 'ReceiptSizeContributions': [], 'CostSalesAmount': 0.0, 'LifeCycleProperties': {u'IsAtRegularPrice': False, u'IsAtMarkdown': False, u'IsFinished': False, u'IsPreSeason': True}, 'MardownDiscount': 0.0, 'RecommendedReceipt': 0.0, 'RecommendedReceiptSizeContributions': [], 'UnitTotalInvBOPQuantity': 250.0, 'CostOnOrderAmount': None, 'InventoryEOP': 250.0, 'CostTotalInvBOPAmount': 940.0, 'Receipt': 0.0, 'Sales': 0.0, 'LostSales': 0.0, 'TotalDiscount': 0.0, 'RetailSalesAmount': 0.0}]
My new list should have the elements below.
[{'UserDiscount': 0.0, 'CostTotalInvEOPAmount': 940.0, 'WeekEndingData': u'2016-10-08', 'WeeksOnHand': 0.0, 'UnitTotalInvEOPQuantity': 250.0, 'WeeksOfSales': 0.0, 'UnitCostAmount': 3.76, 'Week': u'2016 Wk 36', 'CostReceiptAmount': 940.0, 'UnitSalesQuantity': 0.0, 'UnitReceiptQuantity': 250.0, 'Demand': 0.0, 'InventoryBOP': 0.0, 'PEMDiscount': 0.0, 'ElasticLift': 0.0, 'StoreCount': 0, 'PriceStatus': 4, 'UnitOnOrderQuantity': None, 'CostSalesAmount': 0.0, 'RecommendedReceipt': 250.0, 'RetailSalesAmount': 0.0},
{'UserDiscount': 0.0, 'CostTotalInvEOPAmount': 940.0, 'WeekEndingData': u'2016-10-15', 'WeeksOnHand': 0.0, 'UnitTotalInvEOPQuantity': 250.0, 'WeeksOfSales': 15.784951285314385, 'UnitCostAmount': 3.76, 'Week': u'2016 Wk 37', 'CostReceiptAmount': 0.0, 'UnitSalesQuantity': 0.0, 'UnitReceiptQuantity': 0.0, 'Demand': 0.0, 'InventoryBOP': 250.0, 'PEMDiscount': 0.0, 'ElasticLift': 0.0, 'StoreCount': 0, 'PriceStatus': 4, 'UnitOnOrderQuantity': None, 'CostSalesAmount': 0.0, 'RecommendedReceipt': 0.0, 'RetailSalesAmount': 0.0}]
You have a list with two dictionaries. To filter the dictionaries you can try:
keep = ['key1', 'key2']  # keys you want to keep
new_list = []
for item in mylist:
    d = dict((key, value) for key, value in item.items() if key in keep)
    new_list.append(d)
del mylist
Alternatively, using funcy you can do:
import funcy
mydict={1:1,2:2,3:3}
keep=[1,2]
funcy.project(mydict,keep)
=> {1: 1, 2: 2}
which is much prettier imho.
You could use a list comprehension (https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions):
[l for l in your_list if l['UserDiscount'] >= 1 ]
[{'UserDiscount': l['UserDiscount'],'CostTotalInvEOPAmount': l['CostTotalInvEOPAmount']} for l in your_list ]
This way you can filter the elements in your list and change the structure of the dicts it contains.
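If the goal is simply to keep a fixed set of keys from every dict, the two ideas combine into one comprehension (the key names here are just examples):
keep = ['UserDiscount', 'CostTotalInvEOPAmount', 'Week']
new_list = [{k: d[k] for k in keep} for d in your_list]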
