ValueError: 2 columns passed, passed data had 1170 columns - python

When I try to import a GeoJSON file and convert it into a dataframe, this error appears. I want a dataframe with the columns Zipcode, Latitude, and Longitude. Here is my code:
import urllib.request, json
import pandas as pd
with urllib.request.urlopen("http://bostonopendata-boston.opendata.arcgis.com/datasets/53ea466a189b4f43b3dfb7b38fa7f3b6_1.geojson") as url:
wuppertal_data = json.loads(url.read().decode())
neighborhoods_data = wuppertal_data['features']
results = pd.DataFrame()
for data in neighborhoods_data:
zipcode = data['properties']['ZIP5']
temp_df = pd.DataFrame(data['geometry']['coordinates'])
temp_df = temp_df.T
temp_df = pd.DataFrame(temp_df.iloc[:,0].tolist(), columns=['Latitude', 'Longitude'])
temp_df['Zipcode'] = zipcode
results = results.append(temp_df).reset_index(drop=True)
Result:
AssertionError Traceback (most recent call last)
D:\PYTHON3.7\lib\site-packages\pandas\core\internals\construction.py in _list_to_arrays(data, columns, coerce_float, dtype)
496 result = _convert_object_array(
--> 497 content, columns, dtype=dtype, coerce_float=coerce_float
498 )
D:\PYTHON3.7\lib\site-packages\pandas\core\internals\construction.py in _convert_object_array(content, columns, coerce_float, dtype)
580 raise AssertionError(
--> 581 f"{len(columns)} columns passed, passed data had "
582 f"{len(content)} columns"
AssertionError: 2 columns passed, passed data had 1170 columns
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-82-b1c5869e9ca3> in <module>
14 temp_df = pd.DataFrame(data['geometry']['coordinates'])
15 temp_df = temp_df.T
---> 16 temp_df = pd.DataFrame(temp_df.iloc[:,0].tolist(), columns=['Latitude', 'Longitude'])
17
18 temp_df['Neighborhood'] = neighborhood_name
D:\PYTHON3.7\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
472 if is_named_tuple(data[0]) and columns is None:
473 columns = data[0]._fields
--> 474 arrays, columns = to_arrays(data, columns, dtype=dtype)
475 columns = ensure_index(columns)
476
D:\PYTHON3.7\lib\site-packages\pandas\core\internals\construction.py in to_arrays(data, columns, coerce_float, dtype)
459 return [], [] # columns if columns is not None else []
460 if isinstance(data[0], (list, tuple)):
--> 461 return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
462 elif isinstance(data[0], abc.Mapping):
463 return _list_of_dict_to_arrays(
D:\PYTHON3.7\lib\site-packages\pandas\core\internals\construction.py in _list_to_arrays(data, columns, coerce_float, dtype)
498 )
499 except AssertionError as e:
--> 500 raise ValueError(e) from e
501 return result
502
ValueError: 2 columns passed, passed data had 1170 columns
I don't quite understand the error. Can anyone help me out? I don't know what part is wrong.

The problem was that temp_df.iloc[:,0] sometimes contained coordinate lists with more than 2 values each — in which case pandas raised the error, since you supplied only 2 column names. To limit each list to its first 2 values, use temp_df.iloc[:,0].str[:2] instead.
Full code:
import urllib.request, json
import pandas as pd

# Boston ZIP-code boundary polygons, published as GeoJSON on Boston's open-data portal.
GEOJSON_URL = "http://bostonopendata-boston.opendata.arcgis.com/datasets/53ea466a189b4f43b3dfb7b38fa7f3b6_1.geojson"

with urllib.request.urlopen(GEOJSON_URL) as url:
    wuppertal_data = json.loads(url.read().decode())

neighborhoods_data = wuppertal_data['features']

frames = []
for data in neighborhoods_data:
    zipcode = data['properties']['ZIP5']
    # data['geometry']['coordinates'] holds the polygon ring(s): lists of
    # coordinate pairs (sometimes with extra trailing values per vertex).
    temp_df = pd.DataFrame(data['geometry']['coordinates'])
    temp_df = temp_df.T
    # .str[:2] keeps only the first two values of each vertex, so the column
    # count always matches.  NOTE: GeoJSON (RFC 7946) stores coordinates as
    # [longitude, latitude], so the columns must be labelled in that order —
    # the original code swapped them.
    temp_df = pd.DataFrame(temp_df.iloc[:, 0].str[:2].tolist(),
                           columns=['Longitude', 'Latitude'])
    temp_df['Zipcode'] = zipcode
    frames.append(temp_df)

# DataFrame.append in a loop is quadratic and was removed in pandas 2.0;
# collect the pieces and concatenate once instead.
results = pd.concat(frames, ignore_index=True)

Related

.describe() and .info() not working for me in Jupyter Notebook

I am trying to use the describe method to get summary statistics of my data but I keep on getting this error message. Anyway to sort this out? The .info() is also giving me the same problem.
TypeError Traceback (most recent call last)
<ipython-input-28-614cd2726f37> in <module>
----> 1 players_final.describe()
~\anaconda3\lib\site-packages\pandas\core\generic.py in describe(self, percentiles, include, exclude)
10265 elif (include is None) and (exclude is None):
10266 # when some numerics are found, keep only numerics
> 10267 data = self.select_dtypes(include=[np.number])
10268 if len(data.columns) == 0:
10269 data = self
~\anaconda3\lib\site-packages\pandas\core\frame.py in select_dtypes(self, include, exclude)
3420 # the "union" of the logic of case 1 and case 2:
3421 # we get the included and excluded, and return their logical and
-> 3422 include_these = Series(not bool(include), index=self.columns)
3423 exclude_these = Series(not bool(exclude), index=self.columns)
3424
~\anaconda3\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
309 data = data.copy()
310 else:
--> 311 data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True)
312
313 data = SingleBlockManager(data, index, fastpath=True)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
710 value = maybe_cast_to_datetime(value, dtype)
711
--> 712 subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)
713
714 else:
~\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in construct_1d_arraylike_from_scalar(value, length, dtype)
1231 value = ensure_str(value)
1232
-> 1233 subarr = np.empty(length, dtype=dtype)
1234 subarr.fill(value)
1235
TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type
​

ValueError: Shape of passed values is (37679, 43), indices imply (37679, 41)

I am trying to group horse data by races. I am using the pivot function to try to do this, but I keep getting a ValueError.
def group_horse_and_result(element):
if element[0] == 'placing':
return 100 + element[1]
else:
return element[1]
data = data.pivot(index='id', columns='barrier', values=data.columns[2:])
rearranged_columns = sorted(list(data.columns.values), key=group_horse_and_result)
data = data[rearranged_columns]
print(data.head())
data.fillna(0)
And I keep getting this error result:
AssertionError Traceback (most recent call last)
<ipython-input-253-97da160dc172> in <module>
5 return element[1]
6
----> 7 data = data.pivot(index='race_id', columns='placing', values=data.columns[2:])
8 rearranged_columns = sorted(list(data.columns.values), key=group_horse_and_result)
9 data = data[rearranged_columns]
~\anaconda3\lib\site-packages\pandas\core\frame.py in pivot(self, index, columns, values)
6672 from pandas.core.reshape.pivot import pivot
6673
-> 6674 return pivot(self, index=index, columns=columns, values=values)
6675
6676 _shared_docs[
~\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in pivot(data, index, columns, values)
470 # Exclude tuple because it is seen as a single column name
471 values = cast(Sequence[Label], values)
--> 472 indexed = data._constructor(
473 data[values]._values, index=index, columns=values
474 )
~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
495 mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
496 else:
--> 497 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
498
499 # For data is list-like, or Iterable (will consume into list)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_ndarray(values, index, columns, dtype, copy)
232 block_values = [values]
233
--> 234 return create_block_manager_from_blocks(block_values, [columns, index])
235
236
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
1663 ]
1664
-> 1665 mgr = BlockManager(blocks, axes)
1666 mgr._consolidate_inplace()
1667 return mgr
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, blocks, axes, do_integrity_check)
147
148 if do_integrity_check:
--> 149 self._verify_integrity()
150
151 # Populate known_consolidate, blknos, and blklocs lazily
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in _verify_integrity(self)
326 raise construction_error(tot_items, block.shape[1:], self.axes)
327 if len(self.items) != tot_items:
--> 328 raise AssertionError(
329 "Number of manager items must equal union of "
330 f"block items\n# manager items: {len(self.items)}, # "
AssertionError: Number of manager items must equal union of block items
# manager items: 42, # tot_items: 44
Is this something to do with my data pre-processing, or is my code wrong here? I'm relatively new to coding, so apologies if the wording of my question is off. The table shape is (37679, 44).
It might be because of duplicates among the columns.
The duplicate columns can be identified using data.columns.duplicated().

How to use .map on an integer column in python pandas

I'm trying to take an integer column and map discrete values to another column. Basically, if a credit tier is marked 1, 2, or 3, another column maps those to no credit state, no hit, or thin file. Then I fill the null values with 'valid'. However, I keep getting this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-129-926e6625f2b6> in <module>
1 #train.dtypes
----> 2 df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis = 1)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6012 args=args,
6013 kwds=kwds)
-> 6014 return op.get_result()
6015
6016 def applymap(self, func):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
140 return self.apply_raw()
141
--> 142 return self.apply_standard()
143
144 def apply_empty_result(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
246
247 # compute the result using the series generator
--> 248 self.apply_series_generator()
249
250 # wrap results
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
275 try:
276 for i, v in enumerate(series_gen):
--> 277 results[i] = self.f(v)
278 keys.append(v.name)
279 except Exception as e:
<ipython-input-129-926e6625f2b6> in <lambda>(row)
1 #train.dtypes
----> 2 df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis = 1)
<ipython-input-126-462888d46184> in discrete_credit(row, variable)
6
7 """
----> 8 score = row[variable].map({1:'no_credit_state', 2:'thin_file', 3:"no_hit"})
9 score = row[score].fillna('valid')
10 score = pd.Categorical(row[score], ['valid', 'no_credit_state','thin_file', 'no_hit'])
AttributeError: ("'numpy.int64' object has no attribute 'map'", 'occurred at index 0')
Here is a code example that is throwing the same error:
import pandas as pd
credit = {'credit_52278':[1,2,3,500,550,600,650,700,750,800,900]
}
df = pd.DataFrame(credit)
def discrete_credit(row, variable):
"""
allows thin files, no hits and no credit scores to float which will then allow the rest of the credit score to be fit \
with a spline
"""
score = row[variable].map({1:'no_credit_state', 2:'thin_file', 3:"no_hit"})
score = row[score].fillna('valid')
score = pd.Categorical(row[score], ['valid', 'no_credit_state','thin_file', 'no_hit'])
return score
df['discrete_52278'] = df.apply(lambda row: discrete_credit(row, 'credit_52278'), axis = 1)
map is a Series method, but you are trying to use it on a scalar value (a numpy.int64): with df.apply(..., axis=1), row[variable] is a single element, not a Series.
You could simply do something like:
# Map the sentinel credit tiers to their labels; genuine scores fall through
# as NaN, are backfilled with 'valid', and the result is stored as a
# categorical column.
tier_labels = {1: 'no_credit_state', 2: 'thin_file', 3: 'no_hit'}
mapped = df['credit_52278'].map(tier_labels)
df['discrete_52278'] = mapped.fillna('valid').astype('category')

Pandas error when filtering rows based on multiple column conditionals, "ValueError: invalid literal for int() with base 10: "

I am getting a 'ValueError: invalid literal for int() with base 10: ' when I try to filter the dataframe using multiple column conditions
Here is the code to set up the pandas dataframe. Warning: it'll download 6 mb of data. Can run in Google Colab if too concerned.
Code to import stuff and download the data
#Import stuff
import re
import os
import zipfile
from urllib.request import urlretrieve
from os.path import isfile, isdir
import requests
#Define Download Function
def download_file_from_google_drive(id, destination):
    """Download a shared Google Drive file identified by *id* to *destination*.

    Large files make Drive answer with a virus-scan warning page first; when
    the warning cookie is present we re-issue the request with its
    confirmation token before streaming the real content to disk.
    """
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()

    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    if token:
        # Retry with the confirmation token to get the actual file bytes.
        response = session.get(URL, params={'id': id, 'confirm': token},
                               stream=True)

    save_response_content(response, destination)
def get_confirm_token(response):
    """Return the value of Google Drive's download-warning cookie, or None.

    The presence of a cookie whose name starts with 'download_warning'
    signals that Drive wants an explicit confirmation before serving the file.
    """
    warnings = (value for key, value in response.cookies.items()
                if key.startswith('download_warning'))
    return next(warnings, None)
def save_response_content(response, destination):
    """Stream the body of *response* into a file at *destination*."""
    chunk_size = 32768  # bytes fetched from the wire per iteration
    with open(destination, "wb") as out:
        # Empty chunks are keep-alives from the server; skip them.
        out.writelines(chunk
                       for chunk in response.iter_content(chunk_size)
                       if chunk)
#Download data
download_file_from_google_drive('1sZk3WWgdyHLru7q1KSWQwCT4nwwzHlpY', 'TheAnimeList.csv')
Code to set up the pandas dataframe
download_file_from_google_drive('1sZk3WWgdyHLru7q1KSWQwCT4nwwzHlpY', 'TheAnimeList.csv')
animeuser = pd.read_csv('TheAnimeList.csv' )
animeuser = animeuser[['anime_id','title_english', 'popularity', 'rank']]
animeuser.head()
anime_id title_english popularity rank
0 11013 Inu X Boku Secret Service 231 1274.0
1 2104 My Bride is a Mermaid 366 727.0
2 5262 Shugo Chara!! Doki 1173 1508.0
3 721 Princess Tutu 916 307.0
4 12365 Bakuman. 426 50.0
I am trying to filter rows based on column conditionals. First I tried
animeuser = animeuser[ (animeuser.popularity >= 3000) | (animeuser.rank >= 3000) ]
But that gave me this error
TypeError Traceback (most recent call last)
<ipython-input-39-8fb6d8508f25> in <module>()
----> 1 animeuser = animeuser[ (animeuser.popularity >= 3000) | (animeuser.rank >= 3000) ]
TypeError: '>=' not supported between instances of 'method' and 'int'
Then I tried
animeuser = animeuser[ ( animeuser.astype(int)['popularity'] >= 3000 ) | ( animeuser.astype(int)['rank'] >= 3000 ) ]
But that gave me this error
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-40-a2ea65786b2a> in <module>()
----> 1 animeuser = animeuser[ ( animeuser.astype(int)['popularity'] >= 3000 ) | ( animeuser.astype(int)['rank'] >= 3000 ) ]
/usr/local/lib/python3.6/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
116 else:
117 kwargs[new_arg_name] = new_arg_value
--> 118 return func(*args, **kwargs)
119 return wrapper
120 return _deprecate_kwarg
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
4002 # else, only a single dtype is given
4003 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4004 **kwargs)
4005 return self._constructor(new_data).__finalize__(self)
4006
/usr/local/lib/python3.6/dist-packages/pandas/core/internals.py in astype(self, dtype, **kwargs)
3460
3461 def astype(self, dtype, **kwargs):
-> 3462 return self.apply('astype', dtype=dtype, **kwargs)
3463
3464 def convert(self, **kwargs):
/usr/local/lib/python3.6/dist-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3327
3328 kwargs['mgr'] = self
-> 3329 applied = getattr(b, f)(**kwargs)
3330 result_blocks = _extend_blocks(applied, result_blocks)
3331
/usr/local/lib/python3.6/dist-packages/pandas/core/internals.py in astype(self, dtype, copy, errors, values, **kwargs)
542 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
543 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 544 **kwargs)
545
546 def _astype(self, dtype, copy=False, errors='raise', values=None,
/usr/local/lib/python3.6/dist-packages/pandas/core/internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
623
624 # _astype_nansafe works fine with 1-d only
--> 625 values = astype_nansafe(values.ravel(), dtype, copy=True)
626 values = values.reshape(self.shape)
627
/usr/local/lib/python3.6/dist-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy)
690 elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
691 # work around NumPy brokenness, #1987
--> 692 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
693
694 if dtype.name in ("datetime64", "timedelta64"):
pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe()
pandas/_libs/src/util.pxd in util.set_value_at_unsafe()
ValueError: invalid literal for int() with base 10: 'Inu X Boku Secret Service'
The string 'Inu X Boku Secret Service' belongs to the 'title_english' column in the very first row of the dataframe, but the 'rank' and 'popularity' columns seem to be floats and ints.
I even tried looking at the datatypes
animeuser.dtypes
anime_id int64
title_english object
popularity int64
rank float64
dtype: object
And everything seems to be in order.
The first error you are facing is because rank is a method of pandas.DataFrame. Methods have precedence over column access via attribute notation. So in order to access the data you need to use bracket notation: animeuser['rank'].
The second error occurs because you try to represent the whole data frame as int which is not possible for various columns. This would only be possible for the 'rank' and 'popularity' columns.
With statement
animeuser.astype(int)['popularity']
you are trying to convert all animeuser columns to int, which fails on the string column. Try just
animeuser['popularity']

Received "ValueError: If using all scalar values, you must pass an index" in Python

I have run the following code on python in order to retrieve various crypto currency closing prices from their inception. I have run it successfully using the following tickers:
tickers = ['USDT_BTC','USDT_BCH','USDT_ETC','USDT_XMR','USDT_ETH','USDT_DASH',
'USDT_XRP','USDT_LTC','USDT_NXT','USDT_STR','USDT_REP','USDT_ZEC']
I now have changed it as follows (full code included) and get a ValueError.
[LN1]
def CryptoDataCSV(symbol, frequency):
    """Fetch Poloniex chart data for *symbol* and write it to '<symbol>.csv'.

    Params: String symbol, int frequency = 300, 900, 1800, 7200, 14400, 86400
    The CSV contains every candle from the pair's first available date,
    indexed by date.
    """
    url = ('https://poloniex.com/public?command=returnChartData&currencyPair='
           '{}&end=9999999999&period={}&start=0').format(symbol, frequency)
    df = pd.read_json(url)
    df.set_index('date', inplace=True)
    df.to_csv(symbol + '.csv')
    print('Processed: ' + symbol)
[LN2]
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
[LN3]
tickers = 'ETH_BTC','STR_BTC','XMR_BTC','XRP_BTC','LTC_BTC','DASH_BTC',
'ETC_BTC','POT_BTC','OMG_BTC','FCT_BTC','ZEC_BTC','BTS_BTC','VTC_BTC',
'XEM_BTC','MAID_BTC','DGB_BTC','STRAT_BTC','LSK_BTC','XVC_BTC','SC_BTC',
'DOGE_BTC','XBC_BTC','GNT_BTC','EMC2_BTC','CLAM_BTC','RIC_BTC','SYS_BTC',
'DCR_BTC','STEEM_BTC','ZRX_BTC','GAME_BTC','VIA_BTC','NXC_BTC','NXT_BTC'
,'VRC_BTC','NAV_BTC','PINK_BTC','STORJ_BTC','ARDR_BTC','BCN_BTC','CVC_BTC',
'EXP_BTC','LBC_BTC','GNO_BTC','GAS_BTC','OMNI_BTC','XCP_BTC','NEOS_BTC',
'BURST_BTC','AMP_BTC','FLDC_BTC','FLO_BTC','SBD_BTC','BLK_BTC','BTCD_BTC',
'NOTE_BTC','GRC_BTC','PPC_BTC','BTM_BTC','XPM_BTC','NMC_BTC','PASC_BTC',
'NAUT_BTC','BELA_BTC','SJCX_BTC','HUC_BTC','RADS_BTC']
[LN4]
for ticker in tickers:
CryptoDataCSV(ticker, 86400)
I now get the following error:
--------------------------------------------------------------------------- ValueError Traceback (most recent call
last) in ()
1 for ticker in tickers:
----> 2 CryptoDataCSV(ticker, 86400)
in CryptoDataCSV(symbol, frequency)
7 url ='https://poloniex.com/public?command=returnChartData&currencyPair='+symbol+'&end=9999999999&period='+str(frequency)+'&start=0'
8
----> 9 df = pd.read_json(url)
10
11 df.set_index('date',inplace=True)
~\Anaconda3\lib\site-packages\pandas\io\json\json.py in
read_json(path_or_buf, orient, typ, dtype, convert_axes,
convert_dates, keep_default_dates, numpy, precise_float, date_unit,
encoding, lines)
352 obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
353 keep_default_dates, numpy, precise_float,
--> 354 date_unit).parse()
355
356 if typ == 'series' or obj is None:
~\Anaconda3\lib\site-packages\pandas\io\json\json.py in parse(self)
420
421 else:
--> 422 self._parse_no_numpy()
423
424 if self.obj is None:
~\Anaconda3\lib\site-packages\pandas\io\json\json.py in
_parse_no_numpy(self)
637 if orient == "columns":
638 self.obj = DataFrame(
--> 639 loads(json, precise_float=self.precise_float), dtype=None)
640 elif orient == "split":
641 decoded = dict((str(k), v)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in init(self,
data, index, columns, dtype, copy)
273 dtype=dtype, copy=copy)
274 elif isinstance(data, dict):
--> 275 mgr = self._init_dict(data, index, columns, dtype=dtype)
276 elif isinstance(data, ma.MaskedArray):
277 import numpy.ma.mrecords as mrecords
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _init_dict(self,
data, index, columns, dtype)
409 arrays = [data[k] for k in keys]
410
--> 411 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
412
413 def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in
_arrays_to_mgr(arrays, arr_names, index, columns, dtype) 5494 # figure out the index, if necessary 5495 if index is None:
-> 5496 index = extract_index(arrays) 5497 else: 5498 index = _ensure_index(index)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in
extract_index(data) 5533 5534 if not indexes and not
raw_lengths:
-> 5535 raise ValueError('If using all scalar values, you must pass' 5536 ' an index') 5537
ValueError: If using all scalar values, you must pass an index
I just tested your data, and it appears that some of your currency pairs do not work at all, returning a json of the form:
{"error":"Invalid currency pair."}
When this is returned, pd.read_json throws an error, because it can't convert this to a dataframe.
The simplest workaround is to use a try-except block and skip any non-working tickers.
broken_tickers = []
for t in tickers:
    url = ('https://poloniex.com/public?command=returnChartData&currencyPair='
           '{}&end=9999999999&period={}&start=0').format(t, 86400)
    try:
        df = pd.read_json(url)
    except ValueError:
        # Poloniex answers {"error":"Invalid currency pair."} for unknown
        # pairs; read_json cannot build a frame from that, so record the
        # ticker and move on.
        broken_tickers.append(t)
        continue
    # BUG FIX: set_index returns a new frame unless inplace=True; the
    # original discarded the result, so the CSV was written without the
    # date index.  Assign the returned frame instead.
    df = df.set_index('date')
    df.to_csv('{}.csv'.format(t))
I've gotten rid of the function, I didn't really feel it necessary here but you can add it back in.

Categories

Resources