appending json files in python

I am trying to append some JSON files in Python. I have the following code, which seems right; however, I am getting an error.
The code is as follows.
import pandas as pd
df1 = pd.DataFrame()
for i in range(0, 49):
    df = pd.read_json('/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i))
    df1.append(df.T)
The error is as follows.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-76-ddb355627155> in <module>
3 df1=pd.DataFrame()
4 for i in range(0,49):
----> 5 df = pd.read_json ('/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i))
6 df1.append(df.T)
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines, chunksize, compression)
590 return json_reader
591
--> 592 result = json_reader.read()
593 if should_close:
594 try:
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in read(self)
715 obj = self._get_object_parser(self._combine_lines(data.split("\n")))
716 else:
--> 717 obj = self._get_object_parser(self.data)
718 self.close()
719 return obj
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in _get_object_parser(self, json)
737 obj = None
738 if typ == "frame":
--> 739 obj = FrameParser(json, **kwargs).parse()
740
741 if typ == "series" or obj is None:
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in parse(self)
847
848 else:
--> 849 self._parse_no_numpy()
850
851 if self.obj is None:
~/myenv/lib/python3.5/site-packages/pandas/io/json/_json.py in _parse_no_numpy(self)
1091 if orient == "columns":
1092 self.obj = DataFrame(
--> 1093 loads(json, precise_float=self.precise_float), dtype=None
1094 )
1095 elif orient == "split":
ValueError: Expected object or value
The code works when I run it on each file individually. Would anyone be able to help me with this?
Thanks & Best Regards
Michael

The error occurs on the df = pd.read_json(...) line. It is likely that one of the files is missing or malformed. My advice is to use a try/except to identify it:
for i in range(0, 49):
    try:
        df = pd.read_json('/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i))
    except:
        print('Error on iteration', i, ', file',
              '/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i))
        raise
    df1.append(df.T)
Catching any exception is normally bad practice because it can hide truly abnormal conditions like an I/O or memory error. That is why I re-raise the original exception in the code above.
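A separate gotcha once the bad file is found: DataFrame.append is not in-place, so df1.append(df.T) builds a new frame and immediately discards it, leaving df1 empty even when every file reads cleanly. A minimal sketch of the usual pattern (same paths as in the question), collecting the pieces in a list and concatenating once:
import pandas as pd

frames = []
for i in range(0, 49):
    path = '/media/michael/extHDD/Kaggle/DeepFAke/DF_all/metadata{}.json'.format(i)
    frames.append(pd.read_json(path).T)

# one concat at the end is also much faster than appending inside the loop
df1 = pd.concat(frames)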

Related

.describe() and .info() not working for me in Jupyter Notebook

I am trying to use the describe method to get summary statistics of my data, but I keep getting this error message. Any way to sort this out? The .info() method is giving me the same problem.
TypeError Traceback (most recent call last)
<ipython-input-28-614cd2726f37> in <module>
----> 1 players_final.describe()
~\anaconda3\lib\site-packages\pandas\core\generic.py in describe(self, percentiles, include, exclude)
10265 elif (include is None) and (exclude is None):
10266 # when some numerics are found, keep only numerics
--> 10267 data = self.select_dtypes(include=[np.number])
10268 if len(data.columns) == 0:
10269 data = self
~\anaconda3\lib\site-packages\pandas\core\frame.py in select_dtypes(self, include, exclude)
3420 # the "union" of the logic of case 1 and case 2:
3421 # we get the included and excluded, and return their logical and
--> 3422 include_these = Series(not bool(include), index=self.columns)
3423 exclude_these = Series(not bool(exclude), index=self.columns)
3424
~\anaconda3\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
309 data = data.copy()
310 else:
--> 311 data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True)
312
313 data = SingleBlockManager(data, index, fastpath=True)
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
710 value = maybe_cast_to_datetime(value, dtype)
711
--> 712 subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)
713
714 else:
~\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in construct_1d_arraylike_from_scalar(value, length, dtype)
1231 value = ensure_str(value)
1232
--> 1233 subarr = np.empty(length, dtype=dtype)
1234 subarr.fill(value)
1235
TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type
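This traceback fails inside pandas' own dtype machinery (select_dtypes) rather than on your data, which is the usual signature of a binary mismatch between an older pandas release and NumPy 1.20 or newer. Assuming that mismatch is the cause here, upgrading pandas (or pinning NumPy below 1.20 to match the old pandas) should clear it:
pip install --upgrade pandas
pip install "numpy<1.20"  # alternative: keep the old pandas and pin NumPy instead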

could not convert string to float - object type

I'm working with a dataframe in Python using pandas and Jupyter Notebook, and my dataframe has Longitude and Latitude columns with values like '-23,4588'. Somehow, every time I try to convert them to float, I get an error saying 'could not convert string to float'.
I tried changing the comma and changing the .csv column type to float, but nothing works.
A part of my code:
ValueError Traceback (most recent call last)
C:\TEMP/ipykernel_12640/4061618161.py in <module>
----> 1 newocorr_sjc['Latitude'] = newocorr_sjc['Latitude'].astype(float)
c:\users\caique.fernandes\appdata\local\programs\python\python39\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors)
5875 else:
5876 # else, only a single dtype is given
--> 5877 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
5878 return self._constructor(new_data).__finalize__(self, method="astype")
5879
c:\users\caique.fernandes\appdata\local\programs\python\python39\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, copy, errors)
629 self, dtype, copy: bool = False, errors: str = "raise"
630 ) -> "BlockManager":
--> 631 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
632
633 def convert(
c:\users\caique.fernandes\appdata\local\programs\python\python39\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
425 applied = b.apply(f, **kwargs)
426 else:
--> 427 applied = getattr(b, f)(**kwargs)
428 except (TypeError, NotImplementedError):
429 if not ignore_failures:
c:\users\caique.fernandes\appdata\local\programs\python\python39\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors)
671 vals1d = values.ravel()
672 try:
--> 673 values = astype_nansafe(vals1d, dtype, copy=True)
674 except (ValueError, TypeError):
675 # e.g. astype_nansafe can fail on object-dtype of strings
c:\users\caique.fernandes\appdata\local\programs\python\python39\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
1095 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
1096 # Explicit copy, or required since NumPy can't view from / to object.
--> 1097 return arr.astype(dtype, copy=True)
1098
1099 return arr.view(dtype)
ValueError: could not convert string to float: '-23,5327'
Maybe you should use decimal=',' as an argument to pd.read_csv:
df = pd.read_csv('data.csv', sep=';', decimal=',')
>>> df.select_dtypes(float)
     17       22       23
0  17.5 -23.5327 -46.8182
1  56.3 -23.4315 -47.1269
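If the frame is already loaded and the columns came in as strings, replacing the comma before casting works too. A small sketch, reusing the column names from the question:
newocorr_sjc['Latitude'] = newocorr_sjc['Latitude'].str.replace(',', '.').astype(float)
newocorr_sjc['Longitude'] = newocorr_sjc['Longitude'].str.replace(',', '.').astype(float)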

I'm having trouble with a pandas "DataFrame" error

I'm a beginner learning pandas.
I tried the two things below, but neither solved it.
I use Jupyter Notebook.
Could you please help me?
#1
import sys
sys.getdefaultencoding()

#2
from importlib import reload
import sys
reload(sys)

# I wanted to make it work.
# It succeeded in another file, but it doesn't work in the current file.
from pandas import Series, DataFrame
import pandas as pd
data = {"ID": ["001", "002", "003"],
        "city": ["hyougo", "tiba", "gihu"],
        "people": ["100", "230", "249"]}
data_frame = DataFrame(data)
print(data_frame)
#error
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-133-640865466ed4> in <module>
3 "people":["100","230","249"]}
4 data_frame=DataFrame(data)
----> 5 print(data_frame)
~\anaconda3\lib\site-packages\pandas\core\frame.py in __repr__(self)
678 else:
679 width = None
--> 680 self.to_string(
681 buf=buf,
682 max_rows=max_rows,
~\anaconda3\lib\site-packages\pandas\core\frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, min_rows, max_cols, show_dimensions, decimal, line_width, max_colwidth, encoding)
818 line_width=line_width,
819 )
--> 820 return formatter.to_string(buf=buf, encoding=encoding)
821
822 # ----------------------------------------------------------------------
~\anaconda3\lib\site-packages\pandas\io\formats\format.py in to_string(self, buf, encoding)
912 encoding: Optional[str] = None,
913 ) -> Optional[str]:
--> 914 return self.get_result(buf=buf, encoding=encoding)
915
916 def to_latex(
~\anaconda3\lib\site-packages\pandas\io\formats\format.py in get_result(self, buf, encoding)
519 """
520 with self.get_buffer(buf, encoding=encoding) as f:
--> 521 self.write_result(buf=f)
522 if buf is None:
523 return f.getvalue()
~\anaconda3\lib\site-packages\pandas\io\formats\format.py in write_result(self, buf)
821 else:
822
--> 823 strcols = self._to_str_columns()
824 if self.line_width is None: # no need to wrap around just print
825 # the whole frame
~\anaconda3\lib\site-packages\pandas\io\formats\format.py in _to_str_columns(self)
717 # may include levels names also
718
--> 719 str_index = self._get_formatted_index(frame)
720
721 if not is_list_like(self.header) and not self.header:
~\anaconda3\lib\site-packages\pandas\io\formats\format.py in _get_formatted_index(self, frame)
1057 )
1058 else:
--> 1059 fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)]
1060
1061 fmt_index = [
AttributeError: 'list' object has no attribute 'format'
You need to import pandas first and then use the pd alias before DataFrame:
import pandas as pd

data = {"ID": ["001", "002", "003"],
        "city": ["hyougo", "tiba", "gihu"],
        "people": ["100", "230", "249"]}
data_frame = pd.DataFrame(data)
print(data_frame)
Prints:
    ID    city people
0  001  hyougo    100
1  002    tiba    230
2  003    gihu    249
If you don't have pandas installed, you will need to do that first:
pip install pandas

not able to make a DataFrame with yFinance JSON values

I am trying to make a data frame with some of the information I receive from yFinance.info. I have a list of S&P 500 stock symbols, and I made a for loop over the symbols to retrieve the data:
for sym in symbol:
    x = yf.Ticker(sym)
    sector.append(x.info['forwardPE'])
However, every time I run it, it runs for a very long time and returns this error.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-13-c87646d48ecd> in <module>
12 for sym in symbol:
13 x=yf.Ticker(sym)
---> 14 sector.append(x.info['forwardPE'])
15
~/opt/anaconda3/lib/python3.7/site-packages/yfinance/ticker.py in info(self)
136 #property
137 def info(self):
--> 138 return self.get_info()
139
140 #property
~/opt/anaconda3/lib/python3.7/site-packages/yfinance/base.py in get_info(self, proxy, as_dict, *args, **kwargs)
444
445 def get_info(self, proxy=None, as_dict=False, *args, **kwargs):
--> 446 self._get_fundamentals(proxy)
447 data = self._info
448 if as_dict:
~/opt/anaconda3/lib/python3.7/site-packages/yfinance/base.py in _get_fundamentals(self, kind, proxy)
283 # holders
284 url = "{}/{}/holders".format(self._scrape_url, self.ticker)
--> 285 holders = _pd.read_html(url)
286
287 if len(holders)>=3:
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
1098 na_values=na_values,
1099 keep_default_na=keep_default_na,
--> 1100 displayed_only=displayed_only,
1101 )
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
913 break
914 else:
--> 915 raise retained
916
917 ret = []
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
893
894 try:
--> 895 tables = p.parse_tables()
896 except ValueError as caught:
897 # if `io` is an io-like object, check if it's seekable
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in parse_tables(self)
211 list of parsed (header, body, footer) tuples from tables.
212 """
--> 213 tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
214 return (self._parse_thead_tbody_tfoot(table) for table in tables)
215
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in _parse_tables(self, doc, match, attrs)
543
544 if not tables:
--> 545 raise ValueError("No tables found")
546
547 result = []
ValueError: No tables found
When I do it without the append (e.g. x.info['forwardPE']), it runs fine and returns the values one by one. Can anybody please help me fix this problem? Sorry for the poor summary, and thank you in advance.
You could put the line in a try block and except the errors to see which symbols aren't working properly. Since you have 500 tickers to go through, you may encounter more than one exception, so I'd recommend using a broad except Exception statement and (optionally) traceback to get more info on each error:
import traceback
import yfinance as yf

symbol = ['TSLA', 'F', 'MNQ', 'MMM']
sector = []
for sym in symbol:
    try:
        x = yf.Ticker(sym)
        sector.append(x.info['forwardPE'])
    except Exception as error:
        print()
        print(f'{error} for symbol {sym}')
        print(traceback.format_exc())
print(sector)
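One thing to watch: nothing is appended for a symbol that raises, so sector ends up shorter than symbol. If you later need the results lined up with the tickers (say, to build a DataFrame), append None in the except branch to keep the two lists the same length.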

df.columns is giving a ValueError in pyspark

I am completely new to pyspark. I am getting an error while executing these commands:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("basics").getOrCreate()
df = spark.read.csv("data.csv",inferSchema=True,header=True)
df.columns
My data has 1,000,000 rows and 50 columns. I am getting the following error:
ValueError Traceback (most recent call last)
<ipython-input-71-b666bf274d0a> in <module>
----> 1 df.columns
~/anaconda3/lib/python3.7/site-packages/pyspark/sql/dataframe.py in columns(self)
935 ['age', 'name']
936 """
--> 937 return [f.name for f in self.schema.fields]
938
939 #since(2.3)
~/anaconda3/lib/python3.7/site-packages/pyspark/sql/dataframe.py in schema(self)
253 if self._schema is None:
254 try:
--> 255 self._schema = _parse_datatype_json_string(self._jdf.schema().json())
256 except AttributeError as e:
257 raise Exception(
~/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py in _parse_datatype_json_string(json_string)
867 >>> check_datatype(complex_maptype)
868 """
--> 869 return _parse_datatype_json_value(json.loads(json_string))
870
871
~/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py in _parse_datatype_json_value(json_value)
884 tpe = json_value["type"]
885 if tpe in _all_complex_types:
--> 886 return _all_complex_types[tpe].fromJson(json_value)
887 elif tpe == 'udt':
888 return UserDefinedType.fromJson(json_value)
~/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py in fromJson(cls, json)
575 #classmethod
576 def fromJson(cls, json):
--> 577 return StructType([StructField.fromJson(f) for f in json["fields"]])
578
579 def fieldNames(self):
~/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py in <listcomp>(.0)
575 #classmethod
576 def fromJson(cls, json):
--> 577 return StructType([StructField.fromJson(f) for f in json["fields"]])
578
579 def fieldNames(self):
~/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py in fromJson(cls, json)
432 def fromJson(cls, json):
433 return StructField(json["name"],
--> 434 _parse_datatype_json_value(json["type"]),
435 json["nullable"],
436 json["metadata"])
~/anaconda3/lib/python3.7/site-packages/pyspark/sql/types.py in _parse_datatype_json_value(json_value)
880 return DecimalType(int(m.group(1)), int(m.group(2)))
881 else:
--> 882 raise ValueError("Could not parse datatype: %s" % json_value)
883 else:
884 tpe = json_value["type"]
ValueError: Could not parse datatype: decimal(6,-8)
Can anyone please help me understand why I am getting this error and how to overcome it? If I am getting the error because of a wrong schema, how can I define the schema for 50 columns? TIA!
According to what you commented, use inferSchema=True; this UNTESTED code should help you out:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

spark = SparkSession.builder.appName("basics").getOrCreate()
df = spark.read.csv("data.csv", inferSchema=True, header=True)
for column_type in df.dtypes:
    if 'string' in column_type[1]:
        df = df.withColumn(column_type[0], df[column_type[0]].cast(StringType()))
    elif 'double' in column_type[1]:
        df = df.withColumn(column_type[0], df[column_type[0]].cast(DoubleType()))
    elif 'int' in column_type[1]:
        df = df.withColumn(column_type[0], df[column_type[0]].cast(IntegerType()))
    elif 'bool' in column_type[1]:
        df = df.withColumn(column_type[0], df[column_type[0]].cast(BooleanType()))
    elif 'decimal' in column_type[1]:
        df = df.withColumn(column_type[0], df[column_type[0]].cast(DoubleType()))
    # add as many conditions as you need for other types
df.schema
Let me know if it worked for you; if not, I'll test and update it.
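If schema inference itself is what produces the unparseable decimal(6,-8), another route is to skip inference, read every column as a string, and cast only the columns you need. An untested sketch, where "amount" stands in for one of your real column names:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType

spark = SparkSession.builder.appName("basics").getOrCreate()
# inferSchema=False leaves every column as a string, so no decimal(6,-8) is ever inferred
df = spark.read.csv("data.csv", inferSchema=False, header=True)
df = df.withColumn("amount", df["amount"].cast(DoubleType()))  # cast per column as needed
df.columns  # safe now: the schema holds only strings plus your explicit casts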
