Series imported but unused error Python

import numpy as np
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
iris_df = DataFrame()
iris_data_path = 'Z:\WORK\Programming\Python\irisdata.csv'
iris_df = pd.read_csv(iris_data_path,index_col=False,header=None,encoding='utf-8')
iris_df.columns = ['sepal length','sepal width','petal length','petal width','class']
print iris_df.columns.values
print iris_df.head()
print iris_df.tail()
irisX = irisdata[['sepal length','sepal width','petal length','petal width']]
print irisX.tail()
irisy = irisdata['class']
print irisy.head()
print irisy.tail()
colors = ['red','green','blue']
markers = ['o','>','x']
irisyn = np.where(irisy=='Iris-setosa',0,np.where(irisy=='Iris-virginica',2,1))
Col0 = irisdata['sepal length']
Col1 = irisdata['sepal width']
Col2 = irisdata['petal length']
Col3 = irisdata['petal width']
plt.figure(num=1,figsize=(16,10))
plt.subplot(2,3,1)
for i in range(len(colors)):
    xs = Col0[irisyn==i]
    xy = Col1[irisyn==i]
    plt.scatter(xs,xy,color=colors[i],marker=markers[i])
plt.legend( ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica') )
plt.xlabel(irisdata.columns[0])
plt.ylabel(irisdata.columns[1])
plt.subplot(2,3,2)
for i in range(len(colors)):
    xs = Col0[irisyn==i]
    xy = Col2[irisyn==i]
    plt.scatter(xs,xy,color=colors[i],marker=markers[i])
plt.xlabel(irisdata.columns[0])
plt.ylabel(irisdata.columns[2])
plt.subplot(2,3,3)
for i in range(len(colors)):
    xs = Col0[irisyn==i]
    xy = Col3[irisyn==i]
    plt.scatter(xs,xy,color=colors[i],marker=markers[i])
plt.xlabel(irisdata.columns[0])
plt.ylabel(irisdata.columns[3])
plt.subplot(2,3,4)
for i in range(len(colors)):
    xs = Col1[irisyn==i]
    xy = Col2[irisyn==i]
    plt.scatter(xs,xy,color=colors[i],marker=markers[i])
plt.xlabel(irisdata.columns[1])
plt.ylabel(irisdata.columns[2])
plt.subplot(2,3,5)
for i in range(len(colors)):
    xs = Col1[irisyn==i]
    xy = Col3[irisyn==i]
    plt.scatter(xs,xy,color=colors[i],marker=markers[i])
plt.xlabel(irisdata.columns[1])
plt.ylabel(irisdata.columns[3])
plt.subplot(2,3,6)
for i in range(len(colors)):
    xs = Col2[irisyn==i]
    xy = Col3[irisyn==i]
    plt.scatter(xs,xy,color=colors[i],marker=markers[i])
plt.xlabel(irisdata.columns[2])
plt.ylabel(irisdata.columns[3])
plt.show()
This is code from Howard Bandy's book Quantitative Technical Analysis. The problem is that it is giving me errors even though I typed it out exactly as it appears in the book.
I still get the 'Series' imported but unused and undefined name 'irisdata' errors/warnings.
This is in the console:
Code:
runfile('Z:/WORK/Programming/Python/Scripts/irisplotpairsdata2.py', wdir='//AMN/annex/WORK/Programming/Python/Scripts')
['sepal length' 'sepal width' 'petal length' 'petal width' 'class']
sepal length sepal width petal length petal width class
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
sepal length sepal width petal length petal width class
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica
Traceback (most recent call last):
  File "<ipython-input-100-f0b2002668bd>", line 1, in <module>
    runfile('Z:/WORK/Programming/Python/Scripts/irisplotpairsdata2.py', wdir='//AMN/annex/WORK/Programming/Python/Scripts')
  File "C:\MyPrograms\Spyder(Python)\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 685, in runfile
    execfile(filename, namespace)
  File "C:\MyPrograms\Spyder(Python)\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 71, in execfile
    exec(compile(scripttext, filename, 'exec'), glob, loc)
  File "Z:/WORK/Programming/Python/Scripts/irisplotpairsdata2.py", line 24, in <module>
    irisX = irisdata[['sepal length','sepal width','petal length','petal width']]
TypeError: list indices must be integers, not list
Obviously, the program does not run.
I'm using Spyder with Python 2.7, which is the platform he was using in the book.
Thanks for any insight.

Well, Python is not wrong. You imported Series but never used it, which is only a warning and does not cause a crash. The crash happens because you are referencing a variable, irisdata, that was never defined. (Ctrl+F irisdata in your code and take a look.) Judging by your code, irisdata probably needs to contain the parsed data of Z:\WORK\Programming\Python\irisdata.csv, doesn't it? So you need to parse that file and assign the result to irisdata. See this post
eg.
import csv
...
irisdata = list(csv.reader(open(iris_data_path, 'rb')))
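Note, though, that the rest of the script indexes irisdata by column names (irisdata[['sepal length', ...]]), which works on a DataFrame but not on a plain list of rows. A minimal alternative sketch, assuming the book simply means the DataFrame you already parsed into iris_df:
# Assumption: irisdata is the same parsed CSV already held by iris_df
irisdata = iris_df

# The later selections then work as written:
irisX = irisdata[['sepal length','sepal width','petal length','petal width']]
irisy = irisdata['class']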

Related

Efficient way to iterate over rows and columns in pandas

I have a pandas dataframe Bg that was created with samples as rows and r as columns. r is a list of genes that I want to split in a row-wise manner for the entire dataframe.
My code below takes a long time to run and repeatedly crashes. I would like to know if there is a more efficient way to achieve the aim.
import pandas as pd
Bg = pd.DataFrame()
for idx, r in pathway_genes.itertuples():
    for i, p in enumerate(M.index):
        if idx == p:
            for genes, samples in common_mrna.iterrows():
                b = pd.DataFrame({r:samples})
                Bg = Bg.append(b).fillna(0)
M.index
M.index = ['KEGG_VASOPRESSIN_REGULATED_WATER_REABSORPTION',
'KEGG_DRUG_METABOLISM_OTHER_ENZYMES', 'KEGG_PEROXISOME',
'KEGG_LONG_TERM_POTENTIATION', 'KEGG_ADHERENS_JUNCTION', 'KEGG_ALANINE_ASPARTATE_AND_GLUTAMATE_METABOLISM']
pathway_genes
                                                  geneSymbols
KEGG_ABC_TRANSPORTERS                             ['ABCA1', 'ABCA10', 'ABCA12']
KEGG_ACUTE_MYELOID_LEUKEMIA                       ['AKT1', 'AKT2', 'AKT3', 'ARAF']
KEGG_ADHERENS_JUNCTION                            ['ACP1', 'ACTB', 'ACTG1', 'ACTN1', 'ACTN2']
KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY              ['ACACB', 'ACSL1', 'ACSL3', 'ACSL4', 'ACSL5']
KEGG_ALANINE_ASPARTATE_AND_GLUTAMATE_METABOLISM   ['ABAT', 'ACY3', 'ADSL', 'ADSS1', 'ADSS2']
common_mrna
common_mrna = pd.DataFrame([[1.2, 1.3, 1.4, 1.5], [1.6,1.7,1.8,1.9], [2.0,2.1,2.2,2.3], [2.4,2.5,2.6,2.7], [2.8,2.9,3.0,3.1],[3.2,3.3,3.4,3.5],[3.6,3.7,3.8,3.9],[4.0,4.1,4.2,4.3],[4.4,4.5,4.6,4.7],[4.8,4.9,5.0,5.1],[5.2,5.3,5.4,5.5],[5.6,5.7,5.8,5.9],[6.0,6.1,6.2,6.3],[6.4,6.5,6.6,6.7],[6.8,6.9,7.0,7.1],[7.2,7.3,7.4,7.5],[7.6,7.7,7.8,7.9]], columns=['TCGA-02-0033-01', 'TCGA-02-2470-01', 'TCGA-02-2483-01', 'TCGA-06-0124-01'], index =['ABCA1','ABCA10','ABCA12','AKT1','AKT2','AKT3','ARAF','ACP1','ACTB','ACTG1','ACTN1','ACTN2','ABAT','ACY3','ADSL','ADSS1','ADSS2'])
Desired output:
Bg = pd.DataFrame([[4.0,4.1,4.2,4.3],[4.4,4.5,4.6,4.7],[4.8,4.9,5.0,5.1],[5.2,5.3,5.4,5.5],[5.6,5.7,5.8,5.9],[6.0,6.1,6.2,6.3],[6.4,6.5,6.6,6.7],[6.8,6.9,7.0,7.1],[7.2,7.3,7.4,7.5],[7.6,7.7,7.8,7.9]], columns=['TCGA-02-0033-01', 'TCGA-02-2470-01', 'TCGA-02-2483-01', 'TCGA-06-0124-01'], index =['ACP1','ACTB','ACTG1','ACTN1','ACTN2','ABAT','ACY3','ADSL','ADSS1','ADSS2'])
First of all, you can use a list comprehension to match the M_index with the pathway_genes
pathway_genes = {'KEGG_ABC_TRANSPORTERS': ['ABCA1', 'ABCA10', 'ABCA12'], 'KEGG_ACUTE_MYELOID_LEUKEMIA': ['AKT1', 'AKT2', 'AKT3', 'ARAF'], 'KEGG_ADHERENS_JUNCTION': ['ACP1', 'ACTB', 'ACTG1', 'ACTN1', 'ACTN2'], 'KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY': ['ACACB', 'ACSL1', 'ACSL3', 'ACSL4', 'ACSL5'], 'KEGG_ALANINE_ASPARTATE_AND_GLUTAMATE_METABOLISM': ['ABAT', 'ACY3', 'ADSL', 'ADSS1', 'ADSS2']}
matched_index_symbols = [pathway_genes[i] for i in pathway_genes.keys() if i in M_index]
After that, you can use loc to match all the symbols.
flatten_list = [j for sub in matched_index_symbols for j in sub]
Bg = common_mrna.loc[flatten_list]
Out[26]:
TCGA-02-0033-01 TCGA-02-2470-01 TCGA-02-2483-01 TCGA-06-0124-01
ABCA1 1.2 1.3 1.4 1.5
ABCA10 1.6 1.7 1.8 1.9
ABCA12 2.0 2.1 2.2 2.3
ACP1 4.0 4.1 4.2 4.3
ACTB 4.4 4.5 4.6 4.7
ACTG1 4.8 4.9 5.0 5.1
ACTN1 5.2 5.3 5.4 5.5
ACTN2 5.6 5.7 5.8 5.9
ABAT 6.0 6.1 6.2 6.3
ACY3 6.4 6.5 6.6 6.7
ADSL 6.8 6.9 7.0 7.1
ADSS1 7.2 7.3 7.4 7.5
ADSS2 7.6 7.7 7.8 7.9
Update
It seems that your pathway_genes is not originally a dictionary but a DataFrame. If that's the case, you can pull the gene lists out of its geneSymbols column.
pathway_genes
Out[46]:
geneSymbols
KEGG_ABC_TRANSPORTERS [ABCA1, ABCA10, ABCA12]
KEGG_ACUTE_MYELOID_LEUKEMIA [AKT1, AKT2, AKT3, ARAF]
KEGG_ADHERENS_JUNCTION [ACP1, ACTB, ACTG1, ACTN1, ACTN2]
KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY [ACACB, ACSL1, ACSL3, ACSL4, ACSL5]
KEGG_ALANINE_ASPARTATE_AND_GLUTAMATE_METABOLISM [ABAT, ACY3, ADSL, ADSS1, ADSS2]
matched_index_symbols = np.array([pathway_genes['geneSymbols'].loc[i] for i in pathway_genes.index if i in M_index])
flatten_list = matched_index_symbols.ravel()
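One caveat on the update (my note, not part of the original answer): np.array(...).ravel() only flattens cleanly when every matched gene list has the same length. The list-comprehension flattening used earlier avoids that assumption:
# Same result without assuming equal-length gene lists
matched_index_symbols = [pathway_genes['geneSymbols'].loc[i]
                         for i in pathway_genes.index if i in M_index]
flatten_list = [gene for sub in matched_index_symbols for gene in sub]
Bg = common_mrna.loc[flatten_list]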

Subclassing pandas dataframe and setting field in constructor

I'm trying to subclass a pandas data structure. If I set a field on the instance, it works fine.
import seaborn as sns
import pandas as pd
df = sns.load_dataset('iris')
class Results(pd.DataFrame):
    def __init__(self, *args, **kwargs):
        # use the __init__ method from DataFrame to ensure
        # that we're inheriting the correct behavior
        super(Results, self).__init__(*args, **kwargs)

    @property
    def _constructor(self):
        return Results

result_object = Results(df)
result_object['scheme'] = 'not_default'
print(result_object.head(5))
>>> sepal_length sepal_width petal_length petal_width species scheme
0 5.1 3.5 1.4 0.2 setosa not_default
1 4.9 3.0 1.4 0.2 setosa not_default
2 4.7 3.2 1.3 0.2 setosa not_default
3 4.6 3.1 1.5 0.2 setosa not_default
4 5.0 3.6 1.4 0.2 setosa not_default
I don't quite understand the _constructor method under the hood well enough to tell why this does not work.
import seaborn as sns
import pandas as pd
df = sns.load_dataset('iris')
class Results(pd.DataFrame):
    def __init__(self, *args, scheme='default', **kwargs):
        # use the __init__ method from DataFrame to ensure
        # that we're inheriting the correct behavior
        super(Results, self).__init__(*args, **kwargs)
        self['scheme'] = scheme

    @property
    def _constructor(self):
        return Results

result_object = Results(df.copy(), scheme='not_default')
print(result_object.head(5))
>>>
# scheme is still 'default'
sepal_length sepal_width petal_length petal_width species scheme
0 5.1 3.5 1.4 0.2 setosa default
1 4.9 3.0 1.4 0.2 setosa default
2 4.7 3.2 1.3 0.2 setosa default
3 4.6 3.1 1.5 0.2 setosa default
4 5.0 3.6 1.4 0.2 setosa default
Notice the scheme field still says default.
Is there any way to set a field in the instance constructor?
Your current version creates scheme as an attribute (like .index, .columns):
result_object.scheme
# 0 not_default
# 1 not_default
# ...
# 148 not_default
# 149 not_default
# Name: scheme, Length: 150, dtype: object
To make it a proper column, you can modify the incoming data before sending it to super():
class Results(pd.DataFrame):
    def __init__(self, data=None, *args, scheme='default', **kwargs):
        # add column to incoming data
        if isinstance(data, pd.DataFrame):
            data['scheme'] = scheme
        super(Results, self).__init__(data=data, *args, **kwargs)

    @property
    def _constructor(self):
        return Results
df = sns.load_dataset('iris')
result_object = Results(df.copy(), scheme='not_default')
# sepal_length sepal_width petal_length petal_width species scheme
# 0 5.1 3.5 1.4 0.2 setosa not_default
# 1 4.9 3.0 1.4 0.2 setosa not_default
# 2 4.7 3.2 1.3 0.2 setosa not_default
# 3 4.6 3.1 1.5 0.2 setosa not_default
# ... ... ... ... ... ... ...
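As a quick follow-up sketch (not from the original answer): the @property _constructor is what makes derived frames keep the subclass, so operations like head() still return a Results with the scheme column intact:
subset = result_object.head(3)
print(type(subset))               # the Results subclass is preserved via _constructor
print(subset['scheme'].unique())  # ['not_default']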

Multiple hover_name for 3D plot in Python Plotly

I would like to add more hover text using multiple columns for my 3D plot model.
For example:
df:
sepal_length sepal_width petal_length petal_width species species_id
0 5.1 3.5 1.4 0.2 setosa 1
1 4.9 3.0 1.4 0.2 setosa 1
2 4.7 3.2 1.3 0.2 setosa 1
3 4.6 3.1 1.5 0.2 setosa 1
4 5.0 3.6 1.4 0.2 setosa 1
5 5.4 3.9 1.7 0.4 setosa 1
Code:
import plotly.express as px
df = px.data.iris()
fig = px.scatter_3d(df, x='sepal_length', y='sepal_width', z='petal_width',
color='petal_length', symbol='species', hover_name="species")
fig.show()
produced plot
In the plot, the hover_name="species" shows only species in the hover_name. How can I include species_id in hover_name as well?
Simply add the additional information via the hover_data argument, as below:
import plotly.express as px
df = px.data.iris()
fig = px.scatter_3d(df, x='sepal_length', y='sepal_width', z='petal_width',
color='petal_length', symbol='species', hover_name="species", hover_data=["species", "species_id"])
fig.show()
Docs can be found here: Customizing Hover text with Plotly Express
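From the same docs page (a sketch, assuming a reasonably recent plotly.express): hover_data also accepts a dict, which lets you show species_id while hiding entries you don't want repeated in the hover box:
import plotly.express as px

df = px.data.iris()
fig = px.scatter_3d(df, x='sepal_length', y='sepal_width', z='petal_width',
                    color='petal_length', symbol='species', hover_name='species',
                    # dict form: add species_id, hide the duplicate species entry
                    hover_data={'species': False, 'species_id': True})
fig.show()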

Find the range of all columns (difference between maximum and minimum) while gracefully handling string columns

I have a scenario where I have to find the range of all the columns in a dataset which contains multiple columns with numeric values but one column with string values.
Please find sample records from my data set below:
import seaborn as sns
iris = sns.load_dataset('iris')
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
The maximum and minimum of these columns are given by
sepal_length 7.9
sepal_width 4.4
petal_length 6.9
petal_width 2.5
species virginica
dtype: object
and
sepal_length 4.3
sepal_width 2
petal_length 1
petal_width 0.1
species setosa
dtype: object
...respectively. To find the range of all the columns I can use the below code:
iris.max() - iris.min()
But as the column 'species' has string values, the above code is throwing the below error:
TypeError: unsupported operand type(s) for -: 'str' and 'str'
If the above error occurs, I want to print the value as "{max string value} - {min string value}".
In other words, my expected output would be something like:
sepal_length 3.6
sepal_width 2.4
petal_length 5.9
petal_width 2.4
species virginica - setosa
How do I resolve this issue?
Handle the numeric and string columns separately. You can select these using df.select_dtypes. Finally, concat the result.
import numpy as np

u = iris.select_dtypes(include=[np.number])
# U = u.apply(np.ptp, axis=0)  # equivalent alternative for the numeric range
U = u.max() - u.min()
v = iris.select_dtypes(include=[object])
V = v.max() + ' - ' + v.min()
U.append(V)
sepal_length 3.6
sepal_width 2.4
petal_length 5.9
petal_width 2.4
species virginica - setosa
dtype: object
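One caveat, not from the original answer: Series.append was deprecated and then removed in pandas 2.0, so on recent versions combine the two parts with pd.concat instead:
import pandas as pd

# Equivalent to U.append(V) on pandas >= 2.0
result = pd.concat([U, V])
print(result)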

Preprocessing csv files to use with tflearn

My question is about preprocessing csv files before inputing them into a neural network.
I want to build a deep neural network for the famous iris dataset using tflearn in python 3.
Dataset: http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
I'm using tflearn to load the csv file. However, the classes column of my data set has words such as iris-setosa, iris-versicolor, iris-virginica.
Neural networks work only with numbers. So, I have to find a way to change the classes from words to numbers. Since it is a very small dataset, I can do it manually using Excel or a text editor, and I manually assigned numbers to the different classes.
But, I can't possibly do it for every dataset I work with. So, I tried using pandas to perform one hot encoding.
preprocess_data = pd.read_csv("F:\Gautam\.....\Dataset\iris_data.csv")
preprocess_data = pd.get_dummies(preprocess_data)
But now, I can't use this piece of code:
data, labels = load_csv('filepath', categorical_labels=True,
                        n_classes=3)
'filepath' has to be the path to a csv file on disk, not a variable like preprocess_data.
Original Dataset:
Sepal Length Sepal Width Petal Length Petal Width Class
89 5.5 2.5 4.0 1.3 iris-versicolor
85 6.0 3.4 4.5 1.6 iris-versicolor
31 5.4 3.4 1.5 0.4 iris-setosa
52 6.9 3.1 4.9 1.5 iris-versicolor
111 6.4 2.7 5.3 1.9 iris-virginica
Manually modified dataset:
Sepal Length Sepal Width Petal Length Petal Width Class
89 5.5 2.5 4.0 1.3 1
85 6.0 3.4 4.5 1.6 1
31 5.4 3.4 1.5 0.4 0
52 6.9 3.1 4.9 1.5 1
111 6.4 2.7 5.3 1.9 2
Here's my code, which runs perfectly, but only because I modified the dataset manually.
import numpy as np
import pandas as pd
import tflearn
from tflearn.layers.core import input_data, fully_connected
from tflearn.layers.estimator import regression
from tflearn.data_utils import load_csv
data_source = 'F:\Gautam\.....\Dataset\iris_data.csv'
data, labels = load_csv(data_source, categorical_labels=True,
                        n_classes=3)
network = input_data(shape=[None, 4], name='InputLayer')
network = fully_connected(network, 9, activation='sigmoid', name='Hidden_Layer_1')
network = fully_connected(network, 3, activation='softmax', name='Output_Layer')
network = regression(network, batch_size=1, optimizer='sgd', learning_rate=0.2)
model = tflearn.DNN(network)
model.fit(data, labels, show_metric=True, run_id='iris_dataset', validation_set=0.1, n_epoch=2000)
I want to know if there's any other built-in function in tflearn (or in any other module, for that matter) that I can use to modify the value of my classes from words to numbers. I don't think manually modifying the datasets would be productive.
I'm a beginner in tflearn and neural networks also. Any help would be appreciated. Thanks.
Use LabelEncoder from the sklearn library:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

df = pd.read_csv('iris_data.csv', header=None)
df.columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Class']
enc = LabelEncoder()
df['Class'] = enc.fit_transform(df['Class'])
print(df.head(5))
If you want one-hot encoding, first label-encode and then one-hot encode:
enc = LabelEncoder()
enc_1 = OneHotEncoder()
df['Class'] = enc.fit_transform(df['Class'])
# OneHotEncoder expects a 2-D input and returns one column per class,
# so keep the result as its own array rather than a single dataframe column
onehot = enc_1.fit_transform(df[['Class']]).toarray()
print(onehot[:5])
These encoders first sort the words in alphabetical order then assign them labels. If you want to see which label is assigned to which class, do:
for k in list(enc.classes_):
    print('name :: {}, label :: {}'.format(k, enc.transform([k])))
If you want to save this dataframe as a csv file, do:
df.to_csv('Processed_Irisdataset.csv',sep=',')
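To tie this back to load_csv (a sketch, using a hypothetical output path): once the processed dataframe is written to disk, that file's path can be fed to tflearn the same way as in the question:
from tflearn.data_utils import load_csv

# Write without the pandas index so the file keeps only the five data columns
df.to_csv('Processed_Irisdataset.csv', index=False)
data, labels = load_csv('Processed_Irisdataset.csv',
                        categorical_labels=True, n_classes=3)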
The simplest solution is to map by a dict of all possible values:
df['Class'] = df['Class'].map({'iris-versicolor': 1, 'iris-setosa': 0, 'iris-virginica': 2})
print (df)
Sepal Length Sepal Width Petal Length Petal Width Class
0 89 5.5 2.5 4.0 1.3 1
1 85 6.0 3.4 4.5 1.6 1
2 31 5.4 3.4 1.5 0.4 0
3 52 6.9 3.1 4.9 1.5 1
4 111 6.4 2.7 5.3 1.9 2
If you want to generate the dictionary from all unique values:
d = {v:k for k, v in enumerate(df['Class'].unique())}
print (d)
{'iris-versicolor': 0, 'iris-virginica': 2, 'iris-setosa': 1}
df['Class'] = df['Class'].map(d)
print (df)
Sepal Length Sepal Width Petal Length Petal Width Class
0 89 5.5 2.5 4.0 1.3 0
1 85 6.0 3.4 4.5 1.6 0
2 31 5.4 3.4 1.5 0.4 1
3 52 6.9 3.1 4.9 1.5 0
4 111 6.4 2.7 5.3 1.9 2
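Since the question already tried pd.get_dummies, another option (a sketch, not from either answer) is to skip load_csv entirely and hand numpy arrays to tflearn, which model.fit accepts directly:
import pandas as pd

# Hypothetical local copy of the UCI iris file (no header row)
df = pd.read_csv('iris_data.csv', header=None,
                 names=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Class'])
labels = pd.get_dummies(df['Class']).values    # one-hot labels, shape (n_samples, 3)
data = df.drop('Class', axis=1).values         # features, shape (n_samples, 4)
# model.fit(data, labels, ...) then works exactly as in the question's script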
