I'm stuck creating a NumPy array from a file. short_logs.txt is a data table with a header row of column names (no row names), so the data and code look like this:
a b c d
12 3 5 6
9 8 45 8
import numpy as np

log_file = open('short_logs.txt')
samples = log_file.readline()
log_names = samples.split()
# Build a structured dtype with one float64 field per header name.
fields = list(zip(log_names, ['f8'] * len(log_names)))
fields_dtype = np.dtype(fields)
logs = np.loadtxt(log_file, dtype=fields_dtype)
But then I get this error and I don't understand why.
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
C:\IPython\utils\py3compat.pyc in execfile(fname, glob, loc)
195 else:
196 filename = fname
--> 197 exec compile(scripttext, filename, 'exec') in glob, loc
198 else:
199 def execfile(fname, *where):
c:\users\temp\tmp4iajle.py in <module>()
17
18
---> 19 logs = loadtxt(log_file, dtype=fields_dtype)
C:\Users\numpy\lib\npyio.pyc in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin)
848 items = [conv(val) for (conv, val) in zip(converters, vals)]
849 # Then pack it according to the dtype's nesting
--> 850 items = pack_items(items, packing)
851 X.append(items)
852 finally:
C:\Users\numpy\lib\npyio.pyc in pack_items(items, packing)
782 ret = []
783 for length, subpacking in packing:
--> 784 ret.append(pack_items(items[start:start+length], subpacking))
785 start += length
786 return tuple(ret)
C:\Users\numpy\lib\npyio.pyc in pack_items(items, packing)
773 """Pack items into nested lists based on re-packing info."""
774 if packing == None:
--> 775 return items[0]
776 elif packing is tuple:
777 return tuple(items)
IndexError: list index out of range
Can someone point me in the right direction?
Thank you
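For comparison, here is a minimal sketch of the same structured read using np.genfromtxt, whose names=True option takes the field names straight from the header line (assuming whitespace-delimited numeric columns), so there is no separate readline() step to keep in sync with the parser:
import numpy as np

# names=True consumes the first line as field names; each column becomes float64.
logs = np.genfromtxt('short_logs.txt', names=True)
print(logs['a'])  # access a column by its header name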
I am trying to use the describe method to get summary statistics of my data, but I keep getting this error message. Any way to sort this out? The .info() method gives me the same problem.
TypeError Traceback (most recent call last)
<ipython-input-28-614cd2726f37> in <module>
----> 1 players_final.describe()
~\anaconda3\lib\site-packages\pandas\core\generic.py in describe(self, percentiles, include, exclude)
10265 elif (include is None) and (exclude is None):
10266 # when some numerics are found, keep only numerics
> 10267 data = self.select_dtypes(include=[np.number])
10268 if len(data.columns) == 0:
10269 data = self
~\anaconda3\lib\site-packages\pandas\core\frame.py in select_dtypes(self, include, exclude)
3420 # the "union" of the logic of case 1 and case 2:
3421 # we get the included and excluded, and return their logical and
-> 3422 include_these = Series(not bool(include), index=self.columns)
3423 exclude_these = Series(not bool(exclude), index=self.columns)
3424
~\anaconda3\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
309 data = data.copy()
310 else:
--> 311 data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True)
312
313 data = SingleBlockManager(data, index, fastpath=True)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
710 value = maybe_cast_to_datetime(value, dtype)
711
--> 712 subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)
713
714 else:
~\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in construct_1d_arraylike_from_scalar(value, length, dtype)
1231 value = ensure_str(value)
1232
-> 1233 subarr = np.empty(length, dtype=dtype)
1234 subarr.fill(value)
1235
TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type
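This particular message often points to a binary incompatibility between an older pandas and NumPy >= 1.20 (the old pandas code path cannot interpret np.number under the newer NumPy), so a reasonable first step is to check the installed versions and upgrade pandas if it predates your NumPy release:
import numpy as np
import pandas as pd

# If pandas is too old for the installed NumPy, describe()/info()
# can fail inside select_dtypes exactly as in the traceback above.
print(np.__version__, pd.__version__)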
I'm trying to run a fast Fourier transform on a pandas DataFrame that I have. I am using the Kepler exoplanet dataset, here, and a specific notebook for it, here. I recreate the code in cells 27-30 (note that the code in cell 29 is executed elsewhere, so both dataframes have the same shape as in the original notebook), which looks as follows:
import numpy as np
import scipy.fft

def spectrum_getter(X):
    Spectrum = scipy.fft.fft(X, n=X.size)
    return np.abs(Spectrum)

x_train_OS_FT = x_train_OS.apply(spectrum_getter, axis=1)
x_test_FT = x_test.apply(spectrum_getter, axis=1)
Both x_train_OS and x_test are pandas.core.frame.DataFrame. Running this produces:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Input In [245], in <module>
----> 1 x_train_OS_FT = x_train_OS.apply(spectrum_getter, axis=1)
2 x_test_FT = x_test.apply(spectrum_getter, axis=1)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\frame.py:8827, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
8816 from pandas.core.apply import frame_apply
8818 op = frame_apply(
8819 self,
8820 func=func,
(...)
8825 kwargs=kwargs,
8826 )
-> 8827 return op.apply().__finalize__(self, method="apply")
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:727, in FrameApply.apply(self)
724 elif self.raw:
725 return self.apply_raw()
--> 727 return self.apply_standard()
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:851, in FrameApply.apply_standard(self)
850 def apply_standard(self):
--> 851 results, res_index = self.apply_series_generator()
853 # wrap results
854 return self.wrap_results(results, res_index)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\apply.py:867, in FrameApply.apply_series_generator(self)
864 with option_context("mode.chained_assignment", None):
865 for i, v in enumerate(series_gen):
866 # ignore SettingWithCopy here in case the user mutates
--> 867 results[i] = self.f(v)
868 if isinstance(results[i], ABCSeries):
869 # If we have a view on v, we need to make a copy because
870 # series_generator will swap out the underlying data
871 results[i] = results[i].copy(deep=False)
Input In [244], in spectrum_getter(X)
3 def spectrum_getter(X):
----> 4 Spectrum = scipy.fft.fft(X, n=X.size)
5 return np.abs(Spectrum)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_backend.py:22, in _ScipyBackend.__ua_function__(method, args, kwargs)
20 if fn is None:
21 return NotImplemented
---> 22 return fn(*args, **kwargs)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_pocketfft\basic.py:17, in c2c(forward, x, n, axis, norm, overwrite_x, workers, plan)
14 if plan is not None:
15 raise NotImplementedError('Passing a precomputed plan is not yet '
16 'supported by scipy.fft functions')
---> 17 tmp = _asfarray(x)
18 overwrite_x = overwrite_x or _datacopied(tmp, x)
19 norm = _normalization(norm, forward)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\scipy\fft\_pocketfft\helper.py:97, in _asfarray(x)
95 dtype = x.dtype.newbyteorder('=')
96 # Always align input
---> 97 copy = not x.flags['ALIGNED']
98 return np.array(x, dtype=dtype, copy=copy)
File c:\users\marti\appdata\local\programs\python\python39\lib\site-packages\pandas\core\flags.py:98, in Flags.__getitem__(self, key)
96 def __getitem__(self, key):
97 if key not in self._keys:
---> 98 raise KeyError(key)
100 return getattr(self, key)
KeyError: 'ALIGNED'
I attempted to convert the dataframe to a numpy array, but ran into other issues. What am I doing wrong here?
I ran into the same error, so I converted my data to a DataFrame and that solved my problem.
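Alternatively, a minimal sketch of the conversion the question already hints at, assuming the rows are numeric: handing scipy.fft a plain NumPy array sidesteps the lookup of x.flags['ALIGNED'], a key that pandas' own Flags object does not define:
import numpy as np
import scipy.fft

def spectrum_getter(X):
    # A plain ndarray carries real memory flags, unlike a pandas Series.
    x = np.asarray(X, dtype=float)
    return np.abs(scipy.fft.fft(x, n=x.size))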
I am trying to group horse data by races. I am using the pivot function to do this, but I keep getting an AssertionError.
def group_horse_and_result(element):
    if element[0] == 'placing':
        return 100 + element[1]
    else:
        return element[1]

data = data.pivot(index='race_id', columns='placing', values=data.columns[2:])
rearranged_columns = sorted(list(data.columns.values), key=group_horse_and_result)
data = data[rearranged_columns]
print(data.head())
data = data.fillna(0)  # fillna returns a new frame, so reassign it
And I keep getting this error:
AssertionError Traceback (most recent call last)
<ipython-input-253-97da160dc172> in <module>
5 return element[1]
6
----> 7 data = data.pivot(index='race_id', columns='placing', values=data.columns[2:])
8 rearranged_columns = sorted(list(data.columns.values), key=group_horse_and_result)
9 data = data[rearranged_columns]
~\anaconda3\lib\site-packages\pandas\core\frame.py in pivot(self, index, columns, values)
6672 from pandas.core.reshape.pivot import pivot
6673
-> 6674 return pivot(self, index=index, columns=columns, values=values)
6675
6676 _shared_docs[
~\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in pivot(data, index, columns, values)
470 # Exclude tuple because it is seen as a single column name
471 values = cast(Sequence[Label], values)
--> 472 indexed = data._constructor(
473 data[values]._values, index=index, columns=values
474 )
~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
495 mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
496 else:
--> 497 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
498
499 # For data is list-like, or Iterable (will consume into list)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_ndarray(values, index, columns, dtype, copy)
232 block_values = [values]
233
--> 234 return create_block_manager_from_blocks(block_values, [columns, index])
235
236
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
1663 ]
1664
-> 1665 mgr = BlockManager(blocks, axes)
1666 mgr._consolidate_inplace()
1667 return mgr
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, blocks, axes, do_integrity_check)
147
148 if do_integrity_check:
--> 149 self._verify_integrity()
150
151 # Populate known_consolidate, blknos, and blklocs lazily
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in _verify_integrity(self)
326 raise construction_error(tot_items, block.shape[1:], self.axes)
327 if len(self.items) != tot_items:
--> 328 raise AssertionError(
329 "Number of manager items must equal union of "
330 f"block items\n# manager items: {len(self.items)}, # "
AssertionError: Number of manager items must equal union of block items
# manager items: 42, # tot_items: 44
Is this something to do with my data pre-processing, or is my code wrong here? I'm relatively new to coding, so apologies if the wording of my question is off. The table's shape is (37679, 44).
It might be because of duplicates among the columns.
The duplicate columns can be identified using data.columns.duplicated().
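A quick sketch along those lines, assuming data is the frame being pivoted: list the repeated labels, and optionally drop them before calling pivot:
# Show any column labels that occur more than once.
print(data.columns[data.columns.duplicated()])

# One option: keep only the first occurrence of each label.
data = data.loc[:, ~data.columns.duplicated()]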
I am trying to append some JSON files in Python. The following code seems right to me, yet I am getting an error.
The code is as follows.
import pandas as pd

df1 = pd.DataFrame()
for i in range(0, 49):
    df = pd.read_json('/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i))
    df1.append(df.T)
The error is as follows.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-76-ddb355627155> in <module>
3 df1=pd.DataFrame()
4 for i in range(0,49):
----> 5 df = pd.read_json ('/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i))
6 df1.append(df.T)
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines, chunksize, compression)
590 return json_reader
591
--> 592 result = json_reader.read()
593 if should_close:
594 try:
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in read(self)
715 obj = self._get_object_parser(self._combine_lines(data.split("\n")))
716 else:
--> 717 obj = self._get_object_parser(self.data)
718 self.close()
719 return obj
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in _get_object_parser(self, json)
737 obj = None
738 if typ == "frame":
--> 739 obj = FrameParser(json, **kwargs).parse()
740
741 if typ == "series" or obj is None:
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in parse(self)
847
848 else:
--> 849 self._parse_no_numpy()
850
851 if self.obj is None:
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in _parse_no_numpy(self)
1091 if orient == "columns":
1092 self.obj = DataFrame(
-> 1093 loads(json, precise_float=self.precise_float), dtype=None
1094 )
1095 elif orient == "split":
ValueError: Expected object or value
The code works when I read each file individually. Would anyone be able to help me with this?
Thanks and best regards,
Michael
The error occurs on the df = pd.read_json(...) line. It is likely that one of the files is missing or malformed. My advice is to use a try/except to identify it:
for i in range(0, 49):
    try:
        df = pd.read_json('/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i))
    except:
        print('Error on iteration', i, ', file',
              '/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i))
        raise
    df1.append(df.T)
Catching every exception is normally bad practice, because it can hide truly abnormal conditions like an IO or memory error. That is why I re-raise the original exception in the code above.
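As a side note, df1.append(df.T) returns a new DataFrame rather than modifying df1 in place (and append is deprecated in recent pandas), so once the bad file is found, a sketch of the usual pattern, assuming the same file layout:
import pandas as pd

frames = []
for i in range(0, 49):
    path = '/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i)
    frames.append(pd.read_json(path).T)
df1 = pd.concat(frames)  # concatenate once at the end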
I am trying to use plotnine to build graphs, and I keep running into the same KeyError when I want to plot just the x-axis. See the traceback below.
A sample of my data is:
WORD TAG TOPIC Value
0 hey aa 1 234
1 working bb 1 123
2 lullaby cc 2 32
3 Doggy cc 2 63
4 document aa 3 84
A sample of my code:
from plotnine import *
import pandas as pd

inFile = 'infile.csv'
df = pd.read_csv(inFile, names=['WORD', 'TAG', 'TOPIC', 'VALUE'], header=0, sep='\t')
# The column is renamed to 'VALUE' above; sort_values returns a new frame, so reassign it.
df = df.sort_values('VALUE', ascending=False)
sortedDf = df[:5]
plot1 = ggplot(sortedDf) + aes(x='TOPIC') + geom_histogram(binwidth=3)
where the final goal is to plot the count of each topic in a histogram.
I am not sure what data is missing that raises the following KeyError. There should be no need for a weight, as I am only interested in plotting the count of that one variable, i.e. topic 1 = 2, topic 2 = 2, topic 3 = 1.
Does anyone have a link to more detailed documentation of plotnine, or any experience with the library, to help me understand what I am missing?
Traceback Error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-112-71707b4cf21a> in <module>()
1 plot2 = ggplot(sortedDf) + aes(x='TOPIC') + geom_histogram(binwidth=3)
----> 2 print plot2
/Users/anaconda/lib/python2.7/site-packages/plotnine/ggplot.pyc in __repr__(self)
82 Print/show the plot
83 """
---> 84 self.draw()
85 plt.show()
86 return '<ggplot: (%d)>' % self.__hash__()
/Users/anaconda/lib/python2.7/site-packages/plotnine/ggplot.pyc in draw(self)
139 # assign a default theme
140 self = deepcopy(self)
--> 141 self._build()
142
143 # If no theme we use the default
/Users/anaconda/lib/python2.7/site-packages/plotnine/ggplot.pyc in _build(self)
235
236 # Apply and map statistics
--> 237 layers.compute_statistic(layout)
238 layers.map_statistic(self)
239
/Users/anaconda/lib/python2.7/site-packages/plotnine/layer.pyc in compute_statistic(self, layout)
92 def compute_statistic(self, layout):
93 for l in self:
---> 94 l.compute_statistic(layout)
95
96 def map_statistic(self, plot):
/Users/anaconda/lib/python2.7/site-packages/plotnine/layer.pyc in compute_statistic(self, layout)
369 data = self.stat.use_defaults(data)
370 data = self.stat.setup_data(data)
--> 371 data = self.stat.compute_layer(data, params, layout)
372 self.data = data
373
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat.pyc in compute_layer(cls, data, params, layout)
194 return cls.compute_panel(pdata, pscales, **params)
195
--> 196 return groupby_apply(data, 'PANEL', fn)
197
198 @classmethod
/Users/anaconda/lib/python2.7/site-packages/plotnine/utils.pyc in groupby_apply(df, cols, func, *args, **kwargs)
615 # do not mark d as a slice of df i.e no SettingWithCopyWarning
616 d.is_copy = None
--> 617 lst.append(func(d, *args, **kwargs))
618 return pd.concat(lst, axis=axis, ignore_index=True)
619
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat.pyc in fn(pdata)
192 return pdata
193 pscales = layout.get_scales(pdata['PANEL'].iat[0])
--> 194 return cls.compute_panel(pdata, pscales, **params)
195
196 return groupby_apply(data, 'PANEL', fn)
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat.pyc in compute_panel(cls, data, scales, **params)
221 for _, old in data.groupby('group'):
222 old.is_copy = None
--> 223 new = cls.compute_group(old, scales, **params)
224 unique = uniquecols(old)
225 missing = unique.columns.difference(new.columns)
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/stat_bin.pyc in compute_group(cls, data, scales, **params)
107 new_data = assign_bins(
108 data['x'], breaks, data.get('weight'),
--> 109 params['pad'], params['closed'])
110 return new_data
/Users/anaconda/lib/python2.7/site-packages/plotnine/stats/binning.pyc in assign_bins(x, breaks, weight, pad, closed)
163 df = pd.DataFrame({'bin_idx': bin_idx, 'weight': weight})
164 wftable = df.pivot_table(
--> 165 'weight', index=['bin_idx'], aggfunc=np.sum)['weight']
166
167 # Empty bins get no value in the computed frequency table.
/Users/anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
601 result = self.index.get_value(self, key)
602
--> 603 if not is_scalar(result):
604 if is_list_like(result) and not isinstance(result, Series):
605
/Users/anaconda/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_value(self, series, key)
pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3557)()
pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3240)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4363)()
KeyError: 'weight'
Nesting aes inside ggplot, as is done in R, may solve your issue:
plot1 = ggplot(sortedDf, aes(x='TOPIC')) + geom_histogram(binwidth=3)
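Since the goal is a count per topic rather than a binned distribution, a small alternative sketch is geom_bar, which counts occurrences of a discrete variable directly and never touches the histogram's weight bookkeeping (factor() treats TOPIC as categorical):
# Count rows per topic instead of binning a continuous axis.
plot1 = ggplot(sortedDf, aes(x='factor(TOPIC)')) + geom_bar()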