Python: how to read a URL with a ".data" suffix

I'm trying to read data from this url - "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data" into a pandas dataframe.
I've used this technique:
park_df = pd.read_html('https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data', header=0, flavor='bs4')
but I get an error as shown below:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-18-804373f977ab> in <module>()
----> 1 park_df = pd.read_html('https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data', header=0, flavor='bs4')

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, tupleize_cols, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
    985                   decimal=decimal, converters=converters, na_values=na_values,
    986                   keep_default_na=keep_default_na,
--> 987                   displayed_only=displayed_only)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
    813             break
    814     else:
--> 815         raise_with_traceback(retained)
    816
    817     ret = []

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\compat\__init__.py in raise_with_traceback(exc, traceback)
    402     if traceback == Ellipsis:
    403         _, _, traceback = sys.exc_info()
--> 404         raise exc.with_traceback(traceback)
    405     else:
    406         # this version of raise is a syntax error in Python 3

ValueError: No tables found
Can you suggest what I'm doing wrong here, and what a better option would be? Please do open the URL to check how the data looks: the header is in the first row (containing the column names) and the data follows below.

The read_html function is used to convert HTML tables to pandas DataFrames; to read CSV-format data, use read_csv:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data'
df = pd.read_csv(url)
print(df.head())
name MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) \
0 phon_R01_S01_1 119.992 157.302 74.997 0.00784
1 phon_R01_S01_2 122.400 148.650 113.819 0.00968
2 phon_R01_S01_3 116.682 131.111 111.555 0.01050
3 phon_R01_S01_4 116.676 137.871 111.366 0.00997
4 phon_R01_S01_5 116.014 141.781 110.655 0.01284
MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer ... \
0 0.00007 0.00370 0.00554 0.01109 0.04374 ...
1 0.00008 0.00465 0.00696 0.01394 0.06134 ...
2 0.00009 0.00544 0.00781 0.01633 0.05233 ...
3 0.00009 0.00502 0.00698 0.01505 0.05492 ...
4 0.00011 0.00655 0.00908 0.01966 0.06425 ...
Shimmer:DDA NHR HNR status RPDE DFA spread1 \
0 0.06545 0.02211 21.033 1 0.414783 0.815285 -4.813031
1 0.09403 0.01929 19.085 1 0.458359 0.819521 -4.075192
2 0.08270 0.01309 20.651 1 0.429895 0.825288 -4.443179
3 0.08771 0.01353 20.644 1 0.434969 0.819235 -4.117501
4 0.10470 0.01767 19.649 1 0.417356 0.823484 -3.747787
spread2 D2 PPE
0 0.266482 2.301442 0.284654
1 0.335590 2.486855 0.368674
2 0.311173 2.342259 0.332634
3 0.334147 2.405554 0.368975
4 0.234513 2.332180 0.410335
[5 rows x 24 columns]
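Despite the .data extension, the file is plain comma-separated text, which you can confirm by printing its first few lines before handing the URL to read_csv. A minimal sketch (it only assumes the URL is reachable):

import urllib.request

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data'
with urllib.request.urlopen(url) as resp:
    # the first line is the comma-separated header, the rest are data rows
    for _ in range(3):
        print(resp.readline().decode().rstrip())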

Related

not able to make a DataFrame with yFinance JSON values

I am trying to make a DataFrame with some of the information I receive from yFinance's info property. I have a list of S&P 500 stock symbols, and I made a for loop over the stock symbols to retrieve the data:
for sym in symbol:
    x = yf.Ticker(sym)
    sector.append(x.info['forwardPE'])
However, every time I run it, it runs for a very long time and returns this error.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-13-c87646d48ecd> in <module>
12 for sym in symbol:
13 x=yf.Ticker(sym)
---> 14 sector.append(x.info['forwardPE'])
15
~/opt/anaconda3/lib/python3.7/site-packages/yfinance/ticker.py in info(self)
136 #property
137 def info(self):
--> 138 return self.get_info()
139
140 #property
~/opt/anaconda3/lib/python3.7/site-packages/yfinance/base.py in get_info(self, proxy, as_dict, *args, **kwargs)
444
445 def get_info(self, proxy=None, as_dict=False, *args, **kwargs):
--> 446 self._get_fundamentals(proxy)
447 data = self._info
448 if as_dict:
~/opt/anaconda3/lib/python3.7/site-packages/yfinance/base.py in _get_fundamentals(self, kind, proxy)
283 # holders
284 url = "{}/{}/holders".format(self._scrape_url, self.ticker)
--> 285 holders = _pd.read_html(url)
286
287 if len(holders)>=3:
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
1098 na_values=na_values,
1099 keep_default_na=keep_default_na,
-> 1100 displayed_only=displayed_only,
1101 )
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
913 break
914 else:
--> 915 raise retained
916
917 ret = []
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
893
894 try:
--> 895 tables = p.parse_tables()
896 except ValueError as caught:
897 # if `io` is an io-like object, check if it's seekable
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in parse_tables(self)
211 list of parsed (header, body, footer) tuples from tables.
212 """
--> 213 tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
214 return (self._parse_thead_tbody_tfoot(table) for table in tables)
215
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/html.py in _parse_tables(self, doc, match, attrs)
543
544 if not tables:
--> 545 raise ValueError("No tables found")
546
547 result = []
ValueError: No tables found
When I do it without the append (e.g. just x.info['forwardPE']), it runs fine and returns the values one by one. Can anybody please help me with how I could fix this problem? Sorry for the horrible summarization, and thank you in advance.
You could put the line in a try block and except the errors to see which symbols aren't working properly. Since you have 500 tickers to go through, you may encounter more than one exception, so I'd recommend using a broad except Exception statement and optionally using traceback to get more info on each error:
import traceback
import yfinance as yf

symbol = ['TSLA', 'F', 'MNQ', 'MMM']
sector = []
for sym in symbol:
    try:
        x = yf.Ticker(sym)
        sector.append(x.info['forwardPE'])
    except Exception as error:
        print()
        print(f'{error} for symbol {sym}')
        print(traceback.format_exc())
print(sector)
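If you also want to know afterwards which tickers failed, the except block can record them for a later retry. A small variation on the sketch above (the failed list is an addition, not part of the original code):

import yfinance as yf

symbol = ['TSLA', 'F', 'MNQ', 'MMM']
sector = []
failed = []  # symbols whose lookup raised an exception, kept for a retry
for sym in symbol:
    try:
        sector.append(yf.Ticker(sym).info['forwardPE'])
    except Exception:
        failed.append(sym)
print(sector)
print('failed:', failed)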

trying to create a pivot table with pandas and numpy

I'm using the following to try and get a pivot table that multiplies the quantity times the effective price and groups it by Product name:
df_sorted.pivot_table(values=['Quantity', 'EffectivePrice'], index=['ProductName'], aggfunc=np.multiply )
This is the stack trace - not sure why this isn't working.
ValueError Traceback (most recent call last)
/usr/local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
970 try:
--> 971 result = self._aggregate_multiple_funcs([func], _axis=self.axis)
972
/usr/local/lib/python3.9/site-packages/pandas/core/base.py in _aggregate_multiple_funcs(self, arg, _axis)
544 if not len(results):
--> 545 raise ValueError("no results")
546
ValueError: no results
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-35-f616d7b46a13> in <module>
----> 1 df_sorted.pivot_table(values=['Quantity', 'EffectivePrice'], index=['ProductName'], aggfunc=np.multiply )
/usr/local/lib/python3.9/site-packages/pandas/core/frame.py in pivot_table(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed)
6824 from pandas.core.reshape.pivot import pivot_table
6825
-> 6826 return pivot_table(
6827 self,
6828 values=values,
/usr/local/lib/python3.9/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed)
110
111 grouped = data.groupby(keys, observed=observed)
--> 112 agged = grouped.agg(aggfunc)
113 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
114 agged = agged.dropna(how="all")
/usr/local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
981 # raised directly by _aggregate_multiple_funcs
982 raise
--> 983 result = self._aggregate_frame(func)
984 except AttributeError:
985 # catch exception from line 969
/usr/local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in _aggregate_frame(self, func, *args, **kwargs)
1173 if axis != obj._info_axis_number:
1174 for name, data in self:
-> 1175 fres = func(data, *args, **kwargs)
1176 result[name] = fres
1177 else:
ValueError: invalid number of arguments
My understanding is that you cannot apply a multi-column operation inside pivot_table. Computing the product first and then aggregating with groupby can do it. Based on that, I recommend this code (I am not sure the expected result is what you want):
df_sorted['TotalPrice'] = df_sorted['Quantity'] * df_sorted['EffectivePrice']
result = df_sorted.groupby('ProductName')['TotalPrice'].sum()
For this sample dataframe:
Quantity EffectivePrice ProductName
0 1 12 A
1 1 13 B
2 2 14 A
The output is like this:
ProductName
A 40
B 13
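If a pivot-table-shaped result is still wanted, the same idea works by computing the product column first and only then aggregating it, since aggfunc operates on one column at a time. A sketch using the sample dataframe above:

import pandas as pd

df_sorted = pd.DataFrame({'Quantity': [1, 1, 2],
                          'EffectivePrice': [12, 13, 14],
                          'ProductName': ['A', 'B', 'A']})
df_sorted['TotalPrice'] = df_sorted['Quantity'] * df_sorted['EffectivePrice']
# sum the per-row products within each product group
result = df_sorted.pivot_table(values='TotalPrice', index='ProductName', aggfunc='sum')
print(result)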

Version of raise is a syntax error in Python 3

As a rookie, I just started to use the datareader library, in particular the read_html function, and came across the following error when trying to get a table from a website.
import pandas as pd
from pandas_datareader import data
df_list=pd.read_html('https://www.mismarcadores.com/futbol/espana/laliga/clasificacion/')
print(len(df_list))
And I get this syntax error with raise (line 346)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-44-c546df3e8ebd> in <module>()
----> 1 df_list=pd.read_html('https://www.mismarcadores.com/futbol/espana/laliga/clasificacion/')
2 print(len(df_list))
~\Anaconda3\lib\site-packages\pandas\io\html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, tupleize_cols, thousands, encoding, decimal, converters, na_values, keep_default_na)
904 thousands=thousands, attrs=attrs, encoding=encoding,
905 decimal=decimal, converters=converters, na_values=na_values,
--> 906 keep_default_na=keep_default_na)
~\Anaconda3\lib\site-packages\pandas\io\html.py in _parse(flavor, io, match, attrs, encoding, **kwargs)
741 break
742 else:
--> 743 raise_with_traceback(retained)
744
745 ret = []
~\Anaconda3\lib\site-packages\pandas\compat\__init__.py in raise_with_traceback(exc, traceback)
342 if traceback == Ellipsis:
343 _, _, traceback = sys.exc_info()
--> 344 raise exc.with_traceback(traceback)
345 else:
346 # this version of raise is a syntax error in Python 3
ValueError: No tables found
Checking the HTML code, there's actually a table tag at that URL, and I do not understand why it does not pick it up...
Thanks a lot for your help.
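One thing worth trying is fetching the page yourself with a browser-like User-Agent and passing the HTML to read_html, which accepts a raw HTML string. A sketch of that idea (note: if the table is built by JavaScript after the page loads, which is common on live-score sites, the raw HTML contains no <table> and a browser-automation tool such as Selenium would be needed instead):

import pandas as pd
import requests

url = 'https://www.mismarcadores.com/futbol/espana/laliga/clasificacion/'
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
df_list = pd.read_html(html)  # still raises ValueError if no <table> is present
print(len(df_list))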

creating numpy array from matrix with header

I'm stuck at creating a NumPy array from a file. This is the code, where short_logs.txt is a data file with a header row (no row names).
So the matrix and the code look like this:
a b c d
12 3 5 6
9 8 45 8
from numpy import dtype, loadtxt

log_file = open('short_logs.txt')
samples = log_file.readline()
log_names = samples.split()
fields = zip(log_names, ['f8'] * len(log_names))
fields_dtype = dtype(fields)
logs = loadtxt(log_file, dtype=fields_dtype)
But then I get this error and I don't understand why.
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
C:\IPython\utils\py3compat.pyc in execfile(fname, glob, loc)
195 else:
196 filename = fname
--> 197 exec compile(scripttext, filename, 'exec') in glob, loc
198 else:
199 def execfile(fname, *where):
c:\users\temp\tmp4iajle.py in <module>()
17
18
---> 19 logs = loadtxt(log_file, dtype=fields_dtype)
C:\Users\numpy\lib\npyio.pyc in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin)
848 items = [conv(val) for (conv, val) in zip(converters, vals)]
849 # Then pack it according to the dtype's nesting
--> 850 items = pack_items(items, packing)
851 X.append(items)
852 finally:
C:\Users\numpy\lib\npyio.pyc in pack_items(items, packing)
782 ret = []
783 for length, subpacking in packing:
--> 784 ret.append(pack_items(items[start:start+length], subpacking))
785 start += length
786 return tuple(ret)
C:\Users\numpy\lib\npyio.pyc in pack_items(items, packing)
773 """Pack items into nested lists based on re-packing info."""
774 if packing == None:
--> 775 return items[0]
776 elif packing is tuple:
777 return tuple(items)
IndexError: list index out of range
Can someone point me in the right direction?
Thank you
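For this file layout, genfromtxt can be a simpler route, since names=True reads the header row into field names and the dtype does not have to be assembled by hand. A sketch of that alternative (not from the original code):

import numpy as np

# names=True takes the first row ('a b c d') as the field names;
# values are read with the default float64 dtype
logs = np.genfromtxt('short_logs.txt', names=True)
print(logs.dtype.names)  # ('a', 'b', 'c', 'd')
print(logs['a'])         # column a as a float array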

XLDateAmbiguous workaround

Reading Excel files into Python often means tripping over the Excel leap year issue. This is described in many posts, but none offer a convenient solution. So this is what I'm asking here. With code such as:
import xlrd
from pandas import *
xlfile = 'test.xlsx'
wb = xlrd.open_workbook(xlfile)
sn = wb.sheet_names()
dfs = [read_excel(xlfile, x) for x in sn]
How could one avoid the resulting issue*:
---------------------------------------------------------------------------
XLDateAmbiguous Traceback (most recent call last)
<ipython-input-8-1db99305e2ac> in <module>()
1 sn = wb.sheet_names()
2
----> 3 dfs = [read_excel(xlfile, x) for x in sn]
/R/.virtualenv/pydata/lib/python2.7/site-packages/pandas/io/excel.pyc in read_excel(path_or_buf, sheetname, kind, **kwds)
50 """
51 return ExcelFile(path_or_buf,kind=kind).parse(sheetname=sheetname,
---> 52 kind=kind, **kwds)
53
54 class ExcelFile(object):
/R/.virtualenv/pydata/lib/python2.7/site-packages/pandas/io/excel.pyc in parse(self, sheetname, header, skiprows, skip_footer, index_col, parse_cols, parse_dates, date_parser, na_values, thousands, chunksize, **kwds)
138 chunksize=chunksize,
139 skip_footer=skip_footer,
--> 140 **kwds)
141
142 def _should_parse(self, i, parse_cols):
/R/.virtualenv/pydata/lib/python2.7/site-packages/pandas/io/excel.pyc in _parse_excel(self, sheetname, header, skiprows, skip_footer, index_col, has_index_names, parse_cols, parse_dates, date_parser, na_values, thousands, chunksize, **kwds)
194 if parse_cols is None or should_parse[j]:
195 if typ == XL_CELL_DATE:
--> 196 dt = xldate_as_tuple(value, datemode)
197 # how to produce this first case?
198 if dt[0] < datetime.MINYEAR: # pragma: no cover
/R/.virtualenv/pydata/lib/python2.7/site-packages/xlrd/xldate.pyc in xldate_as_tuple(xldate, datemode)
78
79 if xldays < 61 and datemode == 0:
---> 80 raise XLDateAmbiguous(xldate)
81
82 jdn = xldays + _JDN_delta[datemode]
XLDateAmbiguous: 1.0
* other than changing the date system manually in Excel prior to entering any data or searching/replacing 1/1/1900 with NAs...
I've had success with this:
# Set the (timezone-naive) local time as the dataframe index
dat['local_time'] = pd.to_datetime(dat[local_time_column_name], format=date_format)
dat = dat.set_index('local_time')
dat = dat.tz_localize(timezone, ambiguous='infer')
That is: set the timezone-unknown date-times as the dataframe index, and then localize with the ambiguous='infer' flag.
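A self-contained sketch of that approach, using made-up column and timezone names ('timestamp', 'US/Eastern') and the repeated 01:30 reading from a DST fall-back night that 'infer' needs in order to disambiguate:

import pandas as pd

dat = pd.DataFrame({'timestamp': ['2013-11-03 00:30', '2013-11-03 01:30',
                                  '2013-11-03 01:30', '2013-11-03 02:30'],
                    'value': [1, 2, 3, 4]})
dat['local_time'] = pd.to_datetime(dat['timestamp'], format='%Y-%m-%d %H:%M')
dat = dat.set_index('local_time')
# the first 01:30 is inferred as DST (EDT), the second as standard time (EST)
dat = dat.tz_localize('US/Eastern', ambiguous='infer')
print(dat.index)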
