Saved data using pandas is changing - python

I'm trying to save some data using pandas as CSV, and then I'm going to load and use that data somewhere else. This is my code for saving the data:
shuffledInteractionsFirstSeq = [[11, 15, 19, 9, 6, 8, 15, 9, 12, 21, 14, 7, 10, 20, 15, 6, 6, 17, 10, 10, 10, 6, 11, 10, 11, 8, 2, 16, 1, 19, 4, 9, 10, 20, 19, 17, 19, 21, 21, 6, 19, 13, 19, 20, 9, 4, 1, 17, 17, 17, 10, 5, 2, 1, 16, 3, 1, 9, 1, 21, 3, 17, 4, 19, 7, 12, 19, 20, 1, 17, 7, 1, 2, 19, 13, 17, 3, 13, 12, 13, 14, 4, 8, 19, 10, 4, 12, 19, 17, 4, 12, 5, 12, 11, 20, 9, 12, 12, 11, 19, 4, 14, 11, 7, 4, 3, 8, 8, 16, 10, 20, 3, 14, 16, 10, 9, 13, 2, 19, 9, 10, 17, 13, 10, 2, 19, 17, 10, 7, 2, 17, 12, 10, 9, 12, 1, 17, 12, 17, 9, 16, 16, 12, 20, 9, 4, 11, 3, 15, 6, 4, 8, 9, 12, 2, 16, 5, 9, 19, 17, 17, 16, 8, 15, 12, 9, 11, 14, 9, 4, 21, 1, 10, 5, 21, 9, 10, 3, 19, 19, 13, 8, 3, 12, 3, 12, 17, 16, 21, 9, 10, 8, 12, 2, 12, 17, 16, 19, 8, 17, 14, 1, 2, 13, 9, 19, 16, 5, 4, 13, 8, 13, 8, 7, 21, 2, 1, 13, 1, 6, 5, 1, 8, 10, 9, 2, 12, 3, 9, 9, 5, 12, 6, 16, 6, 13, 2, 17, 12, 19, 16, 17, 19, 14, 2, 17, 7, 6, 8, 15, 13, 19, 19, 16, 17, 14, 10, 10, 10, 12, 6, 16, 10, 1, 4, 4, 6, 19, 19, 8, 15, 16, 4, 12, 5, 17, 3, 12, 1, 9, 17, 8, 8, 19, 14, 10, 9, 4, 16, 19, 4, 8, 12, 2, 17, 15, 13, 12, 12, 12, 17, 15, 9, 16, 8, 17, 8, 6, 13, 6, 15, 1, 5, 21, 1, 17, 6, 3, 8, 8, 6, 3, 8, 15, 14, 1, 7, 2, 12, 8, 16, 6, 4, 9, 20, 12, 12, 17, 10, 9, 14, 8, 19, 17, 9, 10, 14, 1, 14, 5, 6, 12, 9, 17, 8, 19, 5, 9, 14, 16, 16, 6, 6, 3, 13, 4, 8, 19, 11, 7, 16, 5, 12, 2, 6, 6, 4, 5, 5, 21, 2, 12, 16, 17, 14, 10, 5, 12, 16, 17, 20, 12, 12, 17, 8, 6, 13, 12, 12, 17, 12, 6, 17, 8, 17, 10, 13, 2, 15, 8, 9, 14, 8, 8, 12, 15, 20, 14, 4, 19, 6, 9, 1, 11, 21, 1, 13, 13, 8, 15, 6, 14, 8, 15, 2, 16, 16, 12, 8, 17, 6, 10, 10, 10, 17, 15, 3, 6, 6, 9, 4, 8, 16, 12, 17, 17, 4, 8, 5, 15, 13, 6, 6, 6, 3, 11, 15, 3, 12, 20, 15, 16, 4, 10, 21, 9, 21, 9, 19, 19, 9, 8, 4, 13, 10, 6, 19, 1, 13, 17, 9, 1, 9, 15, 15, 19, 19, 14, 15, 4, 9, 15, 1, 19, 17, 10, 6, 1, 11, 5, 10, 6, 5, 10, 6, 1, 1, 6, 16, 17, 11, 6, 1, 15, 16, 10, 17, 10, 17, 19, 14, 1, 15, 14, 10, 10, 16, 6, 8, 19, 14, 14, 14, 12, 12, 10, 10, 15, 1, 8, 4, 
1, 14, 14, 7, 10, 10, 14, 10, 17, 19, 20, 6, 8, 9, 14, 10, 14, 1, 15, 19, 10, 1, 19, 4, 15, 21, 10, 9, 3, 14, 14, 10, 10, 6, 8, 20, 6, 2, 16, 6, 9, 10, 8, 2, 17, 17, 1, 19, 13, 20, 12, 1, 16, 20, 16, 12, 9, 16, 10, 3, 14, 8, 20, 12, 12, 11, 17, 20, 11, 4, 20, 4, 15, 4, 8, 3, 12, 21, 17, 12, 10, 8, 21, 17, 10, 8, 4, 4, 16, 14, 12, 14, 14, 4, 9, 12, 4, 14, 4, 10, 10, 4, 10, 3, 9, 20, 1, 16, 10, 20, 12, 20, 5, 3, 8, 16, 9, 20, 10, 20, 21, 8, 9, 8, 5, 8, 11, 8, 19, 6, 6, 10, 19, 6, 10, 15, 8, 19, 5, 17, 19, 10, 16, 8, 19, 12, 15, 19, 15, 14, 6, 21, 16, 13, 10, 16, 5, 14, 17, 15, 5, 13, 1, 13, 15, 6, 13, 3, 15, 13, 4, 6, 8, 4, 4, 4, 6, 6, 4, 15, 3, 15, 3, 15, 16, 16, 13, 10, 19, 7, 6, 10, 10, 1, 10, 8, 20, 3, 3, 10, 15, 16, 10, 2, 10, 5, 16, 21, 7, 15, 10, 15, 3, 10, 8, 10, 8, 1, 1, 15, 8, 19, 4, 10, 10, 6, 15, 15, 6, 20, 4, 1, 10, 9, 21, 20, 6, 12, 10, 10, 14, 21, 20, 8, 14, 4, 10, 9, 12, 16, 1, 19, 16, 10, 5, 3, 1, 8, 1, 8, 1, 19, 1, 4, 6, 17, 3, 15, 8, 8, 4, 19, 1, 14, 15, 8, 6, 15, 1, 5, 10, 7, 8, 13, 15, 15, 8, 15, 14, 6, 5, 4, 15, 1, 10, 10]]
shuffledInteractionsSecondSeq = [[11, 15, 17, 10, 1, 8, 10, 1, 1, 8, 10, 10, 19, 1, 10, 14, 1, 14, 1, 4, 13, 10, 14, 1, 15, 1, 3, 4, 19, 1, 1, 1, 13, 4, 14, 8, 1, 1, 3, 8, 13, 4, 19, 19, 19, 16, 10, 1, 20, 3, 4, 16, 10, 1, 13, 9, 7, 13, 6, 16, 15, 9, 12, 11, 1, 2, 21, 2, 15, 8, 13, 1, 2, 8, 1, 6, 4, 15, 15, 21, 6, 17, 2, 8, 21, 14, 6, 15, 10, 20, 1, 5, 2, 2]]
# Sanity check: length of the longest inner list BEFORE saving.
print(max([len(seq) for seq in shuffledInteractionsFirstSeq]))
# 847
print(max([len(seq) for seq in shuffledInteractionsSecondSeq]))
# 94
# Each cell of the DataFrame holds a Python list; to_csv writes the
# cell as its str() representation, so the list structure is lost.
# (DataFrame/read_csv are presumably `from pandas import ...` — confirm.)
DataFrame(data={'Sequence1' : shuffledInteractionsFirstSeq, 'Sequence2' : shuffledInteractionsSecondSeq}).to_csv('interactionDataSet.csv', index = False)
interactionsCSV = read_csv('interactionDataSet.csv')
# read_csv returns those cells as plain strings, not lists...
interactionsSequence1 = list(interactionsCSV.get('Sequence1'))
interactionsSequence2 = list(interactionsCSV.get('Sequence2'))
# ...so len(seq) below measures the length of the STRING "[11, 15, ...]"
# (3026 characters), not the number of list elements (847).
print(max([len(seq) for seq in interactionsSequence1]))
# 3026
print(max([len(seq) for seq in interactionsSequence2]))
# 327
as you can see the data is changing after saving. I know maybe I'm doing something wrong but I couldn't figure it out

This occurs because read_csv doesn't recognize complex types like lists and reads them as strings:
type(interactionsCSV.at[0, 'Sequence1'])
# <class 'str'>
One possible work around is to use pandas.eval function:
# pandas.eval parses each cell's string back into a Python list.
# NOTE(review): for CSV content you don't fully trust,
# ast.literal_eval applied per cell is the safer choice — confirm
# which guarantees you need here.
interactionsCSV['Sequence1'] = pd.eval(interactionsCSV['Sequence1'])
type(interactionsCSV.at[0, 'Sequence1'])
# <class 'list'>
max([len(s) for s in interactionsCSV.get('Sequence1')])
# 847

Related

Python list of objects sharing reference to other objects?

I have the following code, which on first glance should produce 10 Jobs with 3 Tasks each.
class Job:
    """A job with its own id and its own mapping of task id -> Task."""

    id = None

    def __init__(self):
        # BUG FIX: must be created per instance.  A class-level
        # `tasks = {}` is ONE dict shared by every Job, which is why
        # all 10 jobs originally showed the same 30 task keys.
        self.tasks = {}


class Task:
    """A task identified by a globally unique id."""

    id = None


cnt = 0   # module-level counter so task ids are unique across all jobs
jobs = []
for i in range(0, 10):
    job = Job()
    job.id = i
    for ii in range(0, 3):
        task = Task()
        task.id = cnt
        job.tasks[task.id] = task
        cnt += 1
    jobs.append(job)

for job in jobs:
    print("job {}, tasks: {}".format(job.id, job.tasks.keys()))
The result is somehow surprising - we have 30 Tasks shared by each Job:
job 0, tasks: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
job 1, tasks: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
job 2, tasks: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
job 3, tasks: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
job 4, tasks: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
job 5, tasks: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
job 6, tasks: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
job 7, tasks: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
job 8, tasks: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
job 9, tasks: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
Can someone explain what is going on in here?
UPDATE
tasks is a class variable shared by all the instances.
In your Job class you need to do this
class Job:
    """Job whose task mapping is created per instance."""

    id = None

    def __init__(self):
        # Assigning inside __init__ gives every Job its own dict
        # instead of one dict shared at class level.
        self.tasks = {}
tasks is in your class and each time you are appending to the class tasks which is shared by all the instances.

Using key values from one list to group corresponding items in other lists [closed]

Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 6 years ago.
Improve this question
I have 3 lists:
idd = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
row = [15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7]
col = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2]
what I'm trying to do is to create a dictionary where the keys are the grouped items of idd and the values are a list of list of row and col.
The final output should be something like:
dd = {1:[[15,15,15,15.....][1,2,3,4,....]], 2:[[13,13,13,13,....][6,7,8,9,...]]....}
Thanks!
Give this a shot:
from collections import defaultdict

# Map each id to a pair of lists: [row values, col values].
grouped = defaultdict(lambda: [[], []])
for key, row_val, col_val in zip(idd, row, col):
    pair = grouped[key]
    pair[0].append(row_val)
    pair[1].append(col_val)

SKlearn Random Forest error on input

I am trying to run the fit for my random forest, but I am getting the following error:
forest.fit(train[features], y)
returns
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-41-603415b5d9e6> in <module>()
----> 1 forest.fit(train[rubio_top_corr], y)
/usr/local/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in fit(self, X, y, sample_weight)
210 """
211 # Validate or convert input data
--> 212 X = check_array(X, dtype=DTYPE, accept_sparse="csc")
213 if issparse(X):
214 # Pre-sort indices to avoid that each individual tree of the
/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
396 % (array.ndim, estimator_name))
397 if force_all_finite:
--> 398 _assert_all_finite(array)
399
400 shape_repr = _shape_repr(array.shape)
/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in _assert_all_finite(X)
52 and not np.isfinite(X).all()):
53 raise ValueError("Input contains NaN, infinity"
---> 54 " or a value too large for %r." % X.dtype)
55
56
ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
I have coerced my dataframe from float64 to float32 for my features and made sure that there's no nulls so not sure what's throwing this error. Let me know if it would be helpful to put in more of my code.
UPDATE
It's originally a pandas dataframe, which I dropped all NaNs. The original dataframe is survey results with respondent information, where I dropped all questions except for my dv. I double checked this by running rforest_df.isnull().sum() which returned 0. This is the full code I used for the modeling.
# NOTE(review): this is an alias, not a copy — every mutation of
# rforest_df below also mutates qfav3_only; use qfav3_only.copy()
# if that is not intended.  TODO confirm.
rforest_df = qfav3_only
rforest_df[features] = rforest_df[features].astype(np.float32)
# Random ~75/25 train/test split flag per row.
rforest_df['is_train'] = np.random.uniform(0, 1, len(rforest_df)) <= .75
train, test = rforest_df[rforest_df['is_train']==True], rforest_df[rforest_df['is_train']==False]
# RFC is presumably sklearn's RandomForestClassifier — confirm import.
forest = RFC(n_jobs=2,n_estimators=50)
# Encode the target labels as integer codes 0..n-1.
y, _ = pd.factorize(train['K6_QFAV3'])
forest.fit(train[features], y)
Update
This is what y data looks like
array([ 0, 1, 2, 3, 4, 3, 3, 5, 6, 7, 8, 7, 9, 6, 10, 6, 11,
7, 11, 3, 7, 9, 6, 5, 9, 11, 12, 13, 6, 11, 3, 3, 6, 14,
15, 0, 9, 9, 2, 0, 11, 3, 9, 4, 9, 7, 3, 4, 9, 12, 9,
7, 6, 13, 6, 0, 0, 16, 6, 11, 4, 10, 11, 11, 17, 3, 6, 16,
3, 4, 18, 19, 7, 11, 5, 11, 5, 4, 0, 6, 17, 7, 2, 3, 5,
11, 8, 9, 18, 6, 9, 8, 5, 16, 20, 0, 4, 8, 13, 16, 3, 20,
0, 5, 4, 2, 11, 0, 3, 0, 6, 6, 6, 9, 4, 6, 5, 11, 0,
13, 6, 2, 11, 7, 5, 6, 18, 12, 21, 17, 3, 6, 0, 13, 21, 7,
3, 2, 18, 22, 7, 3, 2, 6, 7, 8, 4, 0, 7, 12, 3, 7, 3,
2, 11, 19, 11, 6, 2, 9, 3, 7, 9, 9, 5, 6, 8, 0, 18, 11,
3, 12, 2, 6, 4, 7, 7, 11, 3, 6, 6, 0, 6, 12, 15, 3, 9,
3, 3, 0, 5, 9, 7, 9, 11, 7, 3, 20, 0, 7, 6, 6, 23, 15,
19, 0, 3, 6, 16, 13, 5, 6, 6, 3, 6, 11, 9, 0, 6, 23, 16,
4, 0, 6, 17, 11, 17, 11, 4, 3, 13, 3, 17, 16, 11, 7, 4, 24,
5, 2, 7, 7, 8, 3, 3, 11, 8, 7, 23, 7, 7, 11, 7, 11, 6,
15, 3, 25, 7, 4, 5, 3, 17, 20, 3, 26, 7, 9, 6, 6, 17, 20,
1, 0, 11, 9, 16, 20, 7, 7, 26, 3, 6, 20, 7, 2, 11, 7, 27,
9, 4, 26, 28, 8, 6, 9, 19, 7, 29, 3, 2, 26, 30, 6, 31, 6,
18, 3, 0, 18, 4, 7, 32, 0, 2, 8, 0, 5, 9, 4, 16, 6, 23,
0, 7, 0, 7, 9, 6, 8, 3, 7, 9, 3, 3, 12, 11, 8, 19, 20,
7, 3, 5, 11, 3, 11, 8, 4, 4, 6, 9, 4, 1, 3, 0, 9, 9,
6, 7, 8, 33, 8, 7, 9, 34, 11, 11, 6, 9, 9, 17, 8, 19, 0,
7, 4, 17, 6, 7, 0, 4, 12, 7, 6, 4, 16, 12, 9, 6, 6, 6,
6, 26, 13, 9, 7, 2, 7, 3, 11, 3, 6, 7, 19, 4, 8, 9, 13,
11, 15, 11, 4, 18, 7, 7, 7, 0, 5, 4, 6, 0, 3, 7, 4, 25,
18, 6, 19, 7, 9, 4, 20, 6, 3, 7, 4, 35, 15, 11, 2, 12, 0,
7, 32, 6, 18, 9, 9, 6, 2, 3, 19, 36, 32, 0, 7, 0, 9, 37,
3, 5, 6, 5, 34, 2, 6, 0, 7, 0, 7, 3, 7, 4, 18, 18, 7,
3, 7, 16, 9, 19, 13, 4, 16, 19, 3, 19, 38, 9, 4, 9, 8, 0,
17, 0, 2, 3, 5, 6, 5, 11, 11, 2, 9, 5, 33, 9, 5, 6, 20,
13, 3, 39, 13, 7, 0, 9, 0, 4, 6, 7, 16, 7, 0, 21, 5, 3,
18, 5, 20, 2, 2, 14, 6, 17, 11, 11, 16, 16, 9, 8, 11, 3, 23,
0, 11, 0, 6, 0, 0, 3, 16, 6, 7, 5, 9, 7, 13, 0, 20, 0,
25, 6, 16, 8, 4, 4, 2, 8, 7, 5, 40, 3, 8, 5, 12, 8, 9,
6, 6, 6, 6, 3, 7, 26, 4, 0, 13, 4, 3, 13, 12, 7, 7, 6,
7, 19, 15, 0, 33, 4, 5, 5, 20, 3, 11, 5, 4, 7, 9, 7, 11,
36, 9, 0, 6, 6, 11, 6, 4, 2, 5, 18, 8, 5, 5, 2, 25, 4,
41, 7, 7, 5, 7, 3, 36, 11, 6, 9, 0, 9, 0, 16, 42, 11, 11,
18, 9, 5, 36, 2, 9, 6, 3, 43, 9, 17, 13, 5, 9, 3, 4, 6,
44, 37, 0, 45, 2, 18, 8, 46, 2, 12, 9, 9, 3, 16, 6, 12, 9,
0, 11, 11, 0, 25, 8, 17, 4, 4, 3, 11, 3, 11, 6, 6, 9, 7,
23, 0, 2, 0, 3, 3, 4, 4, 9, 5, 11, 16, 7, 3, 18, 11, 7,
6, 6, 6, 5, 9, 6, 3, 9, 7, 17, 11, 4, 9, 2, 3, 0, 26,
9, 0, 20, 8, 9, 6, 11, 6, 6, 7, 26, 6, 6, 4, 19, 5, 41,
19, 18, 29, 6, 5, 13, 6, 11, 7, 7, 6, 8, 5, 0, 3, 13, 17,
6, 20, 11, 6, 9, 6, 2, 7, 11, 9, 20, 12, 7, 6, 8, 7, 4,
6, 2, 0, 7, 9, 26, 9, 16, 7, 4, 45, 7, 0, 23, 8, 4, 19,
4, 26, 11, 4, 4, 5, 7, 3, 0, 29, 12, 3, 4, 11, 4, 12, 8,
7, 5, 0, 47, 12, 0, 25, 6, 16, 20, 5, 8, 4, 4, 11, 12, 0,
6, 3, 11, 4, 3, 48, 3, 6, 7, 4, 7, 0, 3, 7, 3, 18, 6,
2, 9, 9, 11, 3, 9, 6, 18, 16, 6, 34, 2, 7, 4, 3, 45, 5,
0, 7, 2, 17, 17, 9, 18, 5, 6, 5, 15, 5, 7, 6, 9, 0, 7,
12, 17])
I would first recommend that you check the datatype for each column within your train[features] df in the following way:
print train[features].dtypes
If you see there are columns that are non-numeric, you can inspect these columns to ensure there are not any unexpected values (e.g. strings, NaNs, etc.) that would cause problems. If you do not mind dropping out non-numeric columns, you can simply select all the numeric columns with the following:
numeric_cols = X.select_dtypes(include=['float64','float32']).columns
You can also add in columns with int dtypes if you would like.
If you are encountering values that are too large or too small for the model to handle, it is a sign that scaling the data is a good idea. In sklearn, this can be accomplished as follows:
scaler = MinMaxScaler(feature_range=(0,1),copy=True).fit(train[features])
train[features] = scaler.transform(train[features])
Finally you should consider imputing missing values with sklearn's Imputer as well as filling NaNs with something like the following:
train[features].fillna(0, inplace=True)
It happens when you have null-strings (like '') in the dataset. Also try to print something like
pd.value_counts()
or even
sorted(list(set(...)))
or get min or max values for each column in the dataset in the loop.
Above example with MinMaxScaler may work, but scaled features do not work well in the RF.

unique (for days) in datetime array in python

I have an array in python called dates with this content:
[datetime.datetime(2012, 1, 11, 17, 24, 12, 676000), datetime.datetime(2012, 2, 3, 11, 25, 17, 73000), datetime.datetime(2012, 2, 3, 14, 9, 23, 699000), datetime.datetime(2012, 2, 4, 9, 15, 26, 644000), datetime.datetime(2012, 2, 4, 17, 14, 36, 65000), datetime.datetime(2012, 2, 5, 6, 18, 31, 139000), datetime.datetime(2012, 2, 5, 14, 55, 28, 62000), datetime.datetime(2012, 2, 5, 18, 28, 59, 379000), datetime.datetime(2012, 2, 6, 12, 24, 21, 768000), datetime.datetime(2012, 2, 6, 17, 32, 46, 675000), datetime.datetime(2012, 2, 14, 11, 33, 6, 74000), datetime.datetime(2012, 2, 14, 11, 36, 48, 11000), datetime.datetime(2012, 2, 16, 8, 54, 14, 175000), datetime.datetime(2012, 2, 16, 18, 33, 9, 200000), datetime.datetime(2012, 2, 20, 8, 41, 2, 550000), datetime.datetime(2012, 2, 20, 9, 4, 37, 446000), datetime.datetime(2012, 2, 20, 10, 10, 42, 950000), datetime.datetime(2012, 2, 20, 21, 21, 21, 986000), datetime.datetime(2012, 2, 21, 9, 1, 8, 429000), datetime.datetime(2012, 2, 21, 12, 5, 20, 475000), datetime.datetime(2012, 2, 21, 13, 23, 25, 281000), datetime.datetime(2012, 2, 21, 15, 4, 29, 366000), datetime.datetime(2012, 2, 21, 15, 12, 21, 729000), datetime.datetime(2012, 2, 21, 15, 29, 10, 723000), datetime.datetime(2012, 2, 21, 18, 10, 24, 822000), datetime.datetime(2012, 2, 22, 10, 42, 11, 689000), datetime.datetime(2012, 2, 22, 13, 10, 1, 309000), datetime.datetime(2012, 2, 22, 20, 28, 34, 260000), datetime.datetime(2012, 2, 27, 17, 53, 19, 225000), datetime.datetime(2012, 2, 28, 8, 13, 57, 139000), datetime.datetime(2012, 3, 2, 7, 55, 11, 505000), datetime.datetime(2012, 3, 2, 21, 6, 35, 270000), datetime.datetime(2012, 3, 5, 8, 10, 47, 76000), datetime.datetime(2012, 3, 5, 9, 15, 15, 448000), datetime.datetime(2012, 3, 7, 18, 15, 35, 401000), datetime.datetime(2012, 3, 15, 8, 6, 56, 968000), datetime.datetime(2012, 3, 16, 15, 34, 10, 59000), datetime.datetime(2012, 3, 20, 18, 19, 13, 687000), datetime.datetime(2012, 3, 22, 8, 50, 28, 983000), 
datetime.datetime(2012, 3, 23, 8, 26, 5, 468000), datetime.datetime(2012, 3, 27, 7, 50, 14, 474000), datetime.datetime(2012, 3, 27, 15, 14, 35, 59000), datetime.datetime(2012, 4, 5, 7, 23, 1, 374000), datetime.datetime(2012, 4, 6, 13, 8, 59, 578000), datetime.datetime(2012, 4, 6, 13, 34, 24, 843000), datetime.datetime(2012, 4, 6, 15, 35, 40, 538000), datetime.datetime(2012, 4, 10, 7, 0, 37, 455000), datetime.datetime(2012, 4, 10, 7, 12, 37, 199000), datetime.datetime(2012, 4, 10, 7, 39, 16, 366000), datetime.datetime(2012, 4, 10, 7, 55, 51, 228000), datetime.datetime(2012, 4, 11, 7, 53, 31, 699000), datetime.datetime(2012, 4, 11, 15, 32, 21, 582000), datetime.datetime(2012, 4, 13, 10, 22, 4, 673000), datetime.datetime(2012, 4, 16, 7, 17, 20, 578000), datetime.datetime(2012, 11, 29, 16, 5, 21, 53000), datetime.datetime(2012, 11, 29, 16, 6, 15, 244000), datetime.datetime(2013, 1, 25, 9, 45, 48, 921000), datetime.datetime(2013, 2, 4, 18, 1, 1, 418000), datetime.datetime(2013, 2, 5, 6, 14, 55, 728000), datetime.datetime(2013, 2, 5, 17, 2, 11, 959000), datetime.datetime(2013, 2, 7, 6, 4, 8, 629000), datetime.datetime(2013, 2, 7, 18, 6, 47, 247000), datetime.datetime(2013, 2, 8, 5, 36, 55, 702000), datetime.datetime(2013, 2, 8, 8, 51, 46, 261000), datetime.datetime(2013, 2, 12, 5, 56, 37, 233000), datetime.datetime(2013, 2, 12, 16, 6, 25, 126000), datetime.datetime(2013, 2, 13, 7, 45, 33, 448000), datetime.datetime(2013, 2, 13, 10, 43, 15, 749000), datetime.datetime(2013, 2, 14, 6, 10, 27, 562000), datetime.datetime(2013, 2, 14, 16, 44, 45, 469000), datetime.datetime(2013, 2, 15, 6, 3, 12, 787000), datetime.datetime(2013, 2, 15, 14, 8, 40, 281000), datetime.datetime(2013, 2, 17, 11, 46, 41, 983000), datetime.datetime(2013, 2, 20, 15, 32, 52, 455000), datetime.datetime(2013, 2, 21, 16, 0, 40, 165000), datetime.datetime(2013, 2, 22, 9, 12, 55, 32000), datetime.datetime(2013, 2, 22, 15, 11, 45, 979000), datetime.datetime(2013, 2, 25, 6, 52, 49, 991000), 
datetime.datetime(2013, 2, 25, 8, 52, 8, 947000), datetime.datetime(2013, 2, 25, 9, 27, 7, 716000), datetime.datetime(2013, 2, 25, 9, 33, 21, 121000), datetime.datetime(2013, 2, 26, 7, 15, 0, 135000), datetime.datetime(2013, 2, 26, 16, 15, 39, 693000), datetime.datetime(2013, 2, 27, 6, 33, 23, 745000), datetime.datetime(2013, 2, 27, 17, 28, 47, 793000), datetime.datetime(2013, 2, 28, 5, 43, 32, 479000), datetime.datetime(2013, 2, 28, 17, 22, 15, 510000), datetime.datetime(2013, 3, 1, 6, 54, 21, 676000), datetime.datetime(2013, 3, 1, 15, 47, 19, 912000), datetime.datetime(2013, 3, 4, 17, 39, 55, 809000), datetime.datetime(2013, 3, 5, 6, 40, 35, 101000), datetime.datetime(2013, 3, 5, 17, 5, 4, 324000), datetime.datetime(2013, 3, 6, 6, 39, 42, 235000), datetime.datetime(2013, 3, 6, 16, 6, 29, 410000), datetime.datetime(2013, 3, 7, 6, 32, 56, 197000), datetime.datetime(2013, 3, 7, 17, 31, 39, 249000), datetime.datetime(2013, 3, 8, 6, 56, 44, 369000), datetime.datetime(2013, 3, 11, 7, 17, 20, 748000), datetime.datetime(2013, 3, 11, 17, 27, 43, 102000), datetime.datetime(2013, 3, 12, 7, 10, 24, 751000), datetime.datetime(2013, 3, 12, 10, 23, 44, 759000), datetime.datetime(2013, 3, 12, 15, 42, 20, 461000), datetime.datetime(2013, 3, 13, 7, 12, 40, 494000), datetime.datetime(2013, 3, 13, 12, 7, 24, 986000), datetime.datetime(2013, 3, 14, 6, 52, 10, 779000), datetime.datetime(2013, 3, 14, 16, 39, 12, 776000), datetime.datetime(2013, 3, 15, 7, 4, 26, 454000), datetime.datetime(2013, 3, 15, 16, 40, 37, 98000), datetime.datetime(2013, 3, 18, 6, 53, 56, 937000), datetime.datetime(2013, 3, 18, 16, 53, 26, 914000), datetime.datetime(2013, 3, 19, 6, 34, 41, 813000), datetime.datetime(2013, 3, 19, 17, 19, 59, 721000), datetime.datetime(2013, 3, 20, 6, 57, 37, 141000), datetime.datetime(2013, 3, 20, 15, 15, 43, 458000), datetime.datetime(2013, 3, 21, 15, 36, 12, 949000), datetime.datetime(2013, 3, 22, 6, 57, 21, 973000), datetime.datetime(2013, 3, 22, 15, 36, 14, 388000), 
datetime.datetime(2013, 3, 25, 7, 0, 43, 602000), datetime.datetime(2013, 3, 25, 18, 27, 0, 693000), datetime.datetime(2013, 3, 26, 17, 20, 48, 194000), datetime.datetime(2013, 3, 27, 7, 11, 17, 665000), datetime.datetime(2013, 3, 27, 18, 27, 41, 894000), datetime.datetime(2013, 3, 28, 7, 2, 8, 624000), datetime.datetime(2013, 3, 28, 11, 12, 22, 53000), datetime.datetime(2013, 4, 3, 5, 45, 23, 995000), datetime.datetime(2013, 4, 4, 6, 5, 39, 243000), datetime.datetime(2013, 4, 8, 6, 4, 34, 667000), datetime.datetime(2013, 4, 8, 17, 6, 8, 718000), datetime.datetime(2013, 4, 9, 6, 2, 32, 813000), datetime.datetime(2013, 4, 9, 15, 16, 46, 622000), datetime.datetime(2013, 4, 10, 5, 26, 16, 694000), datetime.datetime(2013, 4, 10, 18, 50, 54, 809000), datetime.datetime(2013, 4, 11, 15, 12, 29, 376000), datetime.datetime(2013, 4, 12, 6, 9, 38, 925000), datetime.datetime(2013, 4, 12, 14, 42, 32, 607000), datetime.datetime(2013, 4, 15, 10, 0, 59, 995000), datetime.datetime(2013, 4, 15, 10, 11, 42, 16000), datetime.datetime(2013, 4, 16, 6, 8, 3, 838000), datetime.datetime(2013, 4, 16, 15, 27, 35, 147000), datetime.datetime(2013, 4, 17, 6, 4, 44, 272000), datetime.datetime(2013, 4, 17, 15, 23, 0, 924000), datetime.datetime(2013, 4, 18, 6, 9, 55, 454000), datetime.datetime(2013, 4, 18, 15, 5, 43, 601000), datetime.datetime(2013, 4, 19, 6, 0, 38, 132000), datetime.datetime(2013, 4, 19, 16, 35, 26, 14000), datetime.datetime(2013, 4, 19, 17, 44, 17, 116000), datetime.datetime(2013, 4, 19, 17, 51, 48, 43000), datetime.datetime(2013, 4, 19, 17, 54, 30, 44000), datetime.datetime(2013, 4, 21, 14, 58, 56, 363000), datetime.datetime(2013, 4, 21, 15, 8, 11, 276000), datetime.datetime(2013, 4, 23, 6, 24, 57, 124000), datetime.datetime(2013, 4, 23, 15, 44, 30, 503000), datetime.datetime(2013, 4, 25, 6, 13, 9, 269000), datetime.datetime(2013, 4, 25, 15, 41, 11, 370000), datetime.datetime(2013, 4, 26, 6, 2, 17, 877000), datetime.datetime(2013, 4, 27, 16, 17, 34, 97000), 
datetime.datetime(2013, 4, 27, 18, 20, 57, 975000), datetime.datetime(2013, 4, 29, 10, 17, 41, 746000), datetime.datetime(2013, 4, 29, 16, 45, 18, 65000), datetime.datetime(2013, 4, 30, 6, 13, 2, 333000), datetime.datetime(2013, 4, 30, 15, 3, 22, 343000), datetime.datetime(2013, 5, 1, 7, 22, 40, 401000), datetime.datetime(2013, 5, 1, 11, 16, 38, 525000), datetime.datetime(2013, 5, 2, 6, 7, 7, 749000), datetime.datetime(2013, 5, 3, 12, 48, 22, 617000), datetime.datetime(2013, 5, 6, 6, 1, 1, 168000), datetime.datetime(2013, 5, 6, 14, 56, 48, 236000), datetime.datetime(2013, 5, 7, 16, 47, 4, 597000), datetime.datetime(2013, 5, 8, 15, 26, 52, 105000), datetime.datetime(2013, 5, 10, 6, 10, 39, 379000), datetime.datetime(2013, 5, 13, 6, 9, 57, 990000), datetime.datetime(2013, 5, 13, 19, 56, 15, 354000), datetime.datetime(2013, 5, 15, 16, 39, 9, 127000), datetime.datetime(2013, 5, 16, 5, 59, 27, 609000), datetime.datetime(2013, 5, 16, 14, 18, 33, 253000), datetime.datetime(2013, 5, 17, 6, 20, 11, 853000), datetime.datetime(2013, 5, 21, 15, 38, 10, 53000), datetime.datetime(2013, 5, 22, 5, 59, 8, 126000), datetime.datetime(2013, 5, 22, 15, 48, 55, 877000), datetime.datetime(2013, 5, 23, 5, 47, 4, 779000), datetime.datetime(2013, 5, 23, 16, 59, 16, 948000), datetime.datetime(2013, 5, 24, 10, 57, 34, 831000), datetime.datetime(2013, 5, 24, 12, 29, 17, 332000), datetime.datetime(2013, 5, 27, 17, 0, 14, 513000), datetime.datetime(2013, 6, 20, 7, 28, 45, 975000), datetime.datetime(2013, 6, 20, 13, 31, 13, 228000), datetime.datetime(2013, 6, 21, 6, 18, 47, 789000), datetime.datetime(2013, 7, 1, 6, 12, 3, 640000), datetime.datetime(2013, 7, 1, 14, 33, 9, 251000), datetime.datetime(2013, 7, 2, 14, 59, 0, 421000), datetime.datetime(2013, 7, 3, 6, 12, 58, 282000), datetime.datetime(2013, 7, 3, 17, 23, 38, 745000), datetime.datetime(2013, 7, 5, 13, 40, 44, 719000), datetime.datetime(2013, 7, 9, 14, 51, 27, 348000), datetime.datetime(2013, 7, 10, 5, 12, 3, 104000)]
It should be ordered by date. What I need to know is how many distinct days appear in this array. If there are several datetimes on the same day, I'll count that day only once.
I could do it "by hand", iterating over each point and checking to a temp variable and count the days, but isn't there a faster, proper way to "unique" by days?
thanks
You can use the datetime.date() method:
s = {d.date() for d in dates}
print len(s)
Since dates are hashable, you can put them in a set just fine...
Note that you could also get a count of how many times each date appeared:
import collections
print collections.Counter(d.date() for d in dates)
Or, even do a list of datetime instances keyed by date:
import collections
d = collections.defaultdict(list)
for dt in dates:
d[dt.date()].append(dt)
Although, I suppose that since the input is sorted, you could do the same thing more or less with itertools.groupby:
for date, dt_group in itertools.groupby(dates, key=lambda dt: dt.date()):
print date, list(dt_group)
If you simply want to count how many unique days there are, the following works:
print len({(i.day,i.month,i.year) for i in dates})
This is to ensure that it is in fact the same date and not just the same day number, since the 1st of November has the same .day as the 1st of December, but they are obviously not the same day.

index out of range in DES python

I am really sorry for the long program I am posting here; I am just trying to write my own DES encryption code in Python with the little knowledge I have. I have written the following code, which returned an error saying: "m = (B[j][0] << 1) + B[j][5]
IndexError: bitarray index out of range". How can I solve that?
from bitarray import bitarray
# Initial permutation (IP): 64 zero-based bit positions applied to the
# 64-bit message block.
iptable=[57, 49, 41, 33, 25, 17, 9, 1,
59, 51, 43, 35, 27, 19, 11, 3,
61, 53, 45, 37, 29, 21, 13, 5,
63, 55, 47, 39, 31, 23, 15, 7,
56, 48, 40, 32, 24, 16, 8, 0,
58, 50, 42, 34, 26, 18, 10, 2,
60, 52, 44, 36, 28, 20, 12, 4,
62, 54, 46, 38, 30, 22, 14, 6
]
# Permuted choice 1 (PC-1): selects 56 of the 64 key bits.
pc1=[56, 48, 40, 32, 24, 16, 8,
0, 57, 49, 41, 33, 25, 17,
9, 1, 58, 50, 42, 34, 26,
18, 10, 2, 59, 51, 43, 35,
62, 54, 46, 38, 30, 22, 14,
6, 61, 53, 45, 37, 29, 21,
13, 5, 60, 52, 44, 36, 28,
20, 12, 4, 27, 19, 11, 3
]
# Expansion (E): expands the 32-bit right half-block to 48 bits.
expTable=[31, 0, 1, 2, 3, 4,
3, 4, 5, 6, 7, 8,
7, 8, 9, 10, 11, 12,
11, 12, 13, 14, 15, 16,
15, 16, 17, 18, 19, 20,
19, 20, 21, 22, 23, 24,
23, 24, 25, 26, 27, 28,
27, 28, 29, 30, 31, 0]
# Permuted choice 2 (PC-2): 56-bit key -> 48-bit round key.
pc2 = [13, 16, 10, 23, 0, 4,
2, 27, 14, 5, 20, 9,
22, 18, 11, 3, 25, 7,
15, 6, 26, 19, 12, 1,
40, 51, 30, 36, 46, 54,
29, 39, 50, 44, 32, 47,
43, 48, 38, 55, 33, 52,
45, 41, 49, 35, 28, 31]
# The (in)famous S-boxes
# Each S-box maps a 6-bit input (row = outer bits, column = inner four
# bits) to a 4-bit output; stored flattened as 4 rows of 16 values.
__sbox = [
# S1
[14, 4, 13, 1, 2, 15, 11, 8, 3, 10, 6, 12, 5, 9, 0, 7,
0, 15, 7, 4, 14, 2, 13, 1, 10, 6, 12, 11, 9, 5, 3, 8,
4, 1, 14, 8, 13, 6, 2, 11, 15, 12, 9, 7, 3, 10, 5, 0,
15, 12, 8, 2, 4, 9, 1, 7, 5, 11, 3, 14, 10, 0, 6, 13],
# S2
[15, 1, 8, 14, 6, 11, 3, 4, 9, 7, 2, 13, 12, 0, 5, 10,
3, 13, 4, 7, 15, 2, 8, 14, 12, 0, 1, 10, 6, 9, 11, 5,
0, 14, 7, 11, 10, 4, 13, 1, 5, 8, 12, 6, 9, 3, 2, 15,
13, 8, 10, 1, 3, 15, 4, 2, 11, 6, 7, 12, 0, 5, 14, 9],
# S3
[10, 0, 9, 14, 6, 3, 15, 5, 1, 13, 12, 7, 11, 4, 2, 8,
13, 7, 0, 9, 3, 4, 6, 10, 2, 8, 5, 14, 12, 11, 15, 1,
13, 6, 4, 9, 8, 15, 3, 0, 11, 1, 2, 12, 5, 10, 14, 7,
1, 10, 13, 0, 6, 9, 8, 7, 4, 15, 14, 3, 11, 5, 2, 12],
# S4
[7, 13, 14, 3, 0, 6, 9, 10, 1, 2, 8, 5, 11, 12, 4, 15,
13, 8, 11, 5, 6, 15, 0, 3, 4, 7, 2, 12, 1, 10, 14, 9,
10, 6, 9, 0, 12, 11, 7, 13, 15, 1, 3, 14, 5, 2, 8, 4,
3, 15, 0, 6, 10, 1, 13, 8, 9, 4, 5, 11, 12, 7, 2, 14],
# S5
[2, 12, 4, 1, 7, 10, 11, 6, 8, 5, 3, 15, 13, 0, 14, 9,
14, 11, 2, 12, 4, 7, 13, 1, 5, 0, 15, 10, 3, 9, 8, 6,
4, 2, 1, 11, 10, 13, 7, 8, 15, 9, 12, 5, 6, 3, 0, 14,
11, 8, 12, 7, 1, 14, 2, 13, 6, 15, 0, 9, 10, 4, 5, 3],
# S6
[12, 1, 10, 15, 9, 2, 6, 8, 0, 13, 3, 4, 14, 7, 5, 11,
10, 15, 4, 2, 7, 12, 9, 5, 6, 1, 13, 14, 0, 11, 3, 8,
9, 14, 15, 5, 2, 8, 12, 3, 7, 0, 4, 10, 1, 13, 11, 6,
4, 3, 2, 12, 9, 5, 15, 10, 11, 14, 1, 7, 6, 0, 8, 13],
# S7
[4, 11, 2, 14, 15, 0, 8, 13, 3, 12, 9, 7, 5, 10, 6, 1,
13, 0, 11, 7, 4, 9, 1, 10, 14, 3, 5, 12, 2, 15, 8, 6,
1, 4, 11, 13, 12, 3, 7, 14, 10, 15, 6, 8, 0, 5, 9, 2,
6, 11, 13, 8, 1, 4, 10, 7, 9, 5, 0, 15, 14, 2, 3, 12],
# S8
[13, 2, 8, 4, 6, 15, 11, 1, 10, 9, 3, 14, 5, 0, 12, 7,
1, 15, 13, 8, 10, 3, 7, 4, 12, 5, 6, 11, 0, 14, 9, 2,
7, 11, 4, 1, 9, 12, 14, 2, 0, 6, 10, 13, 15, 3, 5, 8,
2, 1, 14, 7, 4, 10, 8, 13, 15, 12, 9, 0, 3, 5, 6, 11],
]
# --- Build the 64-bit message and key blocks --------------------------
msg = bitarray(endian='little')
msg.frombytes(b'ABCDEFGH')
key = bitarray(endian='little')
key.frombytes(b'FFQQSSMM')

perm = bitarray(endian='little')
keyPc1 = bitarray(endian='little')
keyPc2 = bitarray(endian='little')
exp = bitarray(endian='little')

# PC-1: select 56 of the 64 key bits.
for z in pc1:
    keyPc1.append(key[z])
c0 = keyPc1[0:28]
d0 = keyPc1[28:]
key0 = c0 + d0

# PC-2 (permutation of key): 56 bits -> 48-bit round key.
for k in pc2:
    keyPc2.append(key0[k])

# Initial permutation of the message.
for x in iptable:
    perm.append(msg[x])
l1 = perm[0:32]
r1 = perm[32:]

# Expansion of R: 32 bits -> 48 bits.
for y in expTable:
    exp.append(r1[y])

# XOR expanded R with the round key (both 48 bits long).
xor_rk = keyPc2 ^ exp

# Working with S-boxes!
# BUG FIX: the original slices were uneven
# ([0:6], [6:14], [14:20], ..., [38:42], [42:47]) so some groups had
# 4, 5 or 8 bits, and B[j][5] raised "bitarray index out of range".
# The 48-bit value must be split into eight groups of exactly 6 bits.
B = [xor_rk[6 * j:6 * (j + 1)] for j in range(8)]

Bn = [0] * 32
pos = 0
for j in range(8):
    # Row index m = outer bits (b0, b5); column n = inner bits b1..b4.
    m = (B[j][0] << 1) + B[j][5]
    n = (B[j][1] << 3) + (B[j][2] << 2) + (B[j][3] << 1) + B[j][4]
    # S-box lookup; the tables are flattened 4x16, hence (m << 4) + n.
    v = __sbox[j][(m << 4) + n]
    # Unpack the 4-bit output into Bn, most significant bit first.
    Bn[pos] = (v & 8) >> 3
    Bn[pos + 1] = (v & 4) >> 2
    Bn[pos + 2] = (v & 2) >> 1
    Bn[pos + 3] = v & 1
    pos += 4

# BUG FIX: f must be the full 32-bit S-box output as a bitarray so it
# can be XORed with the 32-bit left half.  The original added the first
# eight ints of Bn (yielding a single int) and then XORed it with the
# undefined name `l` instead of `l1`.
# NOTE(review): real DES also applies the P permutation to Bn before
# this XOR; that table is not defined in this script — confirm whether
# it should be added.
f = bitarray(Bn, endian='little')
xor_lf = l1 ^ f
Not all parts of your B list are the same length. For example, this part:
xor_rk[38:42]
has a length of 4, so you can't get the 5th element of that. Is it supposed to have a length of 4? Or did you mean to count by sixes and screw up?

Categories

Resources