I am trying to read a DBF file using the simpledbf library and convert it to a DataFrame for further processing.
from simpledbf import Dbf5
dbf = Dbf5(r"C:\Users\Prashant.kumar\Downloads\dbf\F1_1.DBF")
df1 = dbf.to_dataframe()
Unfortunately, I am getting the following error.
I tried to find a solution but couldn't find a resolution, nor can I find an alternative way to convert the DBF file to a dataframe for post-processing.
Here is the file
https://mega.nz/folder/gKIBUKIa#rE7TmE5FToLzCblMhLLFbw
Is there a way to read this dbf to python as a dataframe?
Use dbfread instead of simpledbf:
# pip install dbfread
from dbfread import DBF
from pandas import DataFrame
dbf = DBF('F1_1.DBF')
df = DataFrame(iter(dbf))
Output:
>>> df
RESPONDENT RESPONDEN2 RESPONDEN3 STATUS FORM_TYPE STATUS_DAT SORT_NAME PSWD_GEN _NullFlags
0 1 AEP Generating Company A 0 1990-01-01 b'\x00'
1 2 ALABAMA POWER COMPANY A 0 2000-05-03 b'\x00'
2 3 Alaska Electric Light and Power Company A 0 1990-01-01 b'\x00'
3 4 Alcoa Power Generating Inc. A 0 1990-01-01 b'\x00'
4 5 THE ALLEGHENY GENERATING COMPANY A 0 1990-01-01 b'\x00'
.. ... ... ... ... ... ... ... ... ...
389 538 DesertLink, LLC A -1 2020-11-17 b'\x00'
390 539 NextEra Energy Transmission MidAtlantic Indian... A -1 2020-12-03 b'\x00'
391 540 Wilderness Line Holdings, LLC A -1 2020-12-15 b'\x00'
392 541 McKenzie Electric Cooperative, Inc. A -1 2021-04-19 b'\x00'
393 542 LS Power Grid New York Corporation I A 0 2021-08-27 b'\x00'
[394 rows x 9 columns]
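As a side note, if simpledbf was tripping over the file's text encoding, dbfread also lets you override the code page recorded in the DBF header. A minimal sketch, assuming the path from the question and a cp1252 code page (both assumptions):
# Sketch: force a specific code page when decoding text fields.
from dbfread import DBF
from pandas import DataFrame
dbf = DBF(r"C:\Users\Prashant.kumar\Downloads\dbf\F1_1.DBF",
          encoding='cp1252',             # assumed code page; adjust to your file
          char_decode_errors='replace')  # don't crash on stray bytes
df = DataFrame(iter(dbf))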
I am trying to build a dataframe where the data is grabbed from multiple files. I have created an empty dataframe with the desired shape, but I am having trouble grabbing the data. I found this, but when I concat, I am still getting NaN values.
Edit 2: I changed the order of df creation and put the concat inside the for loop; same result (for obvious reasons).
import pandas as pd
import os
import glob
def daily_country_framer():
    # create assignments
    country_source = r"C:\Users\USER\PycharmProjects\Corona Stats\Country Series"
    list_of_files = glob.glob(country_source + r"\*.csv")
    latest_file = max(list_of_files, key=os.path.getctime)
    last_frame = pd.read_csv(latest_file)
    date_list = []
    label_list = []
    # build date_list values
    for file in os.listdir(country_source):
        file = file.replace('.csv', '')
        date_list.append(file)
    # build country_list values
    for country in last_frame['Country']:
        label_list.append(country)
    # create dataframe for each file in folder
    for filename in os.listdir(country_source):
        filepath = os.path.join(country_source, filename)
        if not os.path.isfile(filepath):
            continue
        df1 = pd.read_csv(filepath)
        df = pd.DataFrame(index=label_list, columns=date_list)
        df1 = pd.concat([df])
        print(df1)

daily_country_framer()
Two sample dataframes: (notice the different shapes)
Country Confirmed Deaths Recovered
0 World 1595350 95455 353975
1 Afghanistan 484 15 32
2 Albania 409 23 165
3 Algeria 1666 235 347
4 Andorra 583 25 58
.. ... ... ... ...
180 Vietnam 255 0 128
181 West Bank and Gaza 263 1 44
182 Western Sahara 4 0 0
183 Zambia 39 1 24
184 Zimbabwe 11 3 0
[185 rows x 4 columns]
Country Confirmed Deaths Recovered
0 World 1691719 102525 376096
1 Afghanistan 521 15 32
2 Albania 416 23 182
3 Algeria 1761 256 405
4 Andorra 601 26 71
.. ... ... ... ...
181 West Bank and Gaza 267 2 45
182 Western Sahara 4 0 0
183 Yemen 1 0 0
184 Zambia 40 2 25
185 Zimbabwe 13 3 0
[186 rows x 4 columns]
Current output:
01-22-2020 01-23-2020 ... 04-09-2020 04-10-2020
World NaN NaN ... NaN NaN
Afghanistan NaN NaN ... NaN NaN
Albania NaN NaN ... NaN NaN
Algeria NaN NaN ... NaN NaN
Andorra NaN NaN ... NaN NaN
... ... ... ... ... ...
West Bank and Gaza NaN NaN ... NaN NaN
Western Sahara NaN NaN ... NaN NaN
Yemen NaN NaN ... NaN NaN
Zambia NaN NaN ... NaN NaN
Zimbabwe NaN NaN ... NaN NaN
[186 rows x 80 columns]
Desired output: where each NaN is replaced by the corresponding value from a target column, or a list of all columns, i.e. if ['Confirmed'] then 0, 1, 2, 3, 4; if all columns then [0,0,0], [1,0,0], [2,0,0].
Your code (with comments inline):
import pandas as pd
import os
import glob
def daily_country_framer():
    # create assignments
    country_source = r"C:\Users\USER\PycharmProjects\Corona Stats\Country Series"
    list_of_files = glob.glob(country_source + r"\*.csv")
    latest_file = max(list_of_files, key=os.path.getctime)
    last_frame = pd.read_csv(latest_file)
    date_list = []
    label_list = []
    # build date_list values
    for file in os.listdir(country_source):
        file = file.replace('.csv', '')
        date_list.append(file)
    # build country_list values
    for country in last_frame['Country']:  # == last_frame['Country'].tolist()
        label_list.append(country)
    # create dataframe for each file in folder
    for filename in os.listdir(country_source):
        filepath = os.path.join(country_source, filename)
        if not os.path.isfile(filepath):
            continue
        df1 = pd.read_csv(filepath)
        # you redefine df1 for every file in the loop. So if there
        # are 10 files, only the last one is actually used anywhere
        # outside this loop.
        df = pd.DataFrame(index=label_list, columns=date_list)
        df1 = pd.concat([df])
        # here you just redefined df1 again as the concatenation of the
        # empty dataframe you just created in the line above.
        print(df1)

daily_country_framer()
So hopefully that illuminates why you were getting the results you were getting. It was doing exactly what you asked it to do.
What you want to do is get a dictionary with dates as keys and the associated dataframe as values, then concatenate that. This can be quite expensive because of some quirks with how pandas does concatenation, but if you concatenate along axis=0, you should be fine.
A better way might be the following:
import pandas as pd
import os

def daily_country_framer(country_source):
    accumulator = {}
    # key each file's dataframe by the date in its filename
    for filename in os.listdir(country_source):
        date = filename.replace('.csv', '')
        filepath = os.path.join(country_source, filename)
        accumulator[date] = pd.read_csv(filepath)
    # now we have a dictionary of {date : data} -- perfect!
    df = pd.concat(accumulator)
    return df

daily_country_framer(r"C:\Users\USER\PycharmProjects\Corona Stats\Country Series")
Does that work?
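If you then want the countries-by-dates grid from your desired output, one possible follow-up is to pivot the stacked frame. A sketch, assuming the 'Confirmed' column and the directory path from your samples:
df = daily_country_framer(r"C:\Users\USER\PycharmProjects\Corona Stats\Country Series")
# pd.concat on a dict puts the date keys in the first index level;
# lift that level into a column and pivot to a Country x date grid.
wide = (df.reset_index(level=0)
          .rename(columns={'level_0': 'date'})
          .pivot_table(index='Country', columns='date', values='Confirmed'))
print(wide)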
I need to merge some data in a dataframe because I will implement sequential association rules in Python.
How can I merge the data, and which algorithm should I use in Python?
Apriori? FP-Growth?
I can't find a sequential-association-rule implementation using apriori in Python; the examples I've found use R.
There are 250 visit places, 116,807 unique IDs, and 1.7 million rows in total. Each ID has a country_code (111 countries, which I will classify into 10 countries), so I will merge on that as well.
Previous Data
index date_ymd id visit_nm country
1 20170801 123123 seoul 460
2 20170801 123123 tokyo 460
3 20170801 124567 seoul 440
4 20170802 123123 osaka 460
5 20170802 123123 seoul 460
... ... ... ... ...
What I need
index Transaction visit_nm country
1 20170801123123 {seoul,tokyo} 460
2 20170802123123 {osaka,seoul} 460
From what I understood looking at the data, use groupby with agg:
s = pd.Series(df.date_ymd.astype(str) + df.id.astype(str), name='Transaction')
(df.groupby(s)
   .agg({'visit_nm': lambda x: set(x), 'country': 'first'})
   .reset_index())
Transaction visit_nm country
0 20170801123123 {seoul, tokyo} 460
1 20170801124567 {seoul} 440
2 20170802123123 {osaka, seoul} 460
Also you could use:
df['Transaction'] = df['date_ymd'].map(str)+df['id'].map(str)
df.groupby('Transaction').agg({'visit_nm': lambda x: set(x), 'country': 'first'}).reset_index()
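Since the end goal is association-rule mining, the visit_nm sets can then be fed into an apriori implementation. A sketch, assuming the third-party mlxtend package and an arbitrary min_support (both assumptions); note that apriori mines unordered itemsets, so the sequential part would still need separate handling:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

grouped = df.groupby('Transaction').agg(
    {'visit_nm': lambda x: set(x), 'country': 'first'}).reset_index()
# One-hot encode the transactions, then mine frequent itemsets.
te = TransactionEncoder()
onehot = te.fit_transform(grouped['visit_nm'].apply(list))
basket = pd.DataFrame(onehot, columns=te.columns_)
itemsets = apriori(basket, min_support=0.01, use_colnames=True)  # assumed threshold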
I am trying to read this small data file,
Link - https://drive.google.com/open?id=1nAS5mpxQLVQn9s_aAKvJt8tWPrP_DUiJ
I am using the code -
df = pd.read_table('/Data/123451_date.csv', sep=';', index_col=0, engine='python', error_bad_lines=False)
It has ';' as a separator, and values are missing for some columns in some observations (rows).
How can I read it properly? The current dataframe is not loaded correctly.
It looks like the data you use has some garbage in it. Precisely, rows 1-33 (inclusive) contain additional, unnecessary (non-GPS) information. You can either fix the file by manually removing the unneeded information, or use the following code snippet to skip those rows:
from pandas import read_table
data = read_table('34_2017-02-06.gpx.csv', sep=';',
                  skiprows=list(range(1, 34))).drop("Unnamed: 28", axis=1)
The drop("Unnamed: 28", axis=1) is simply there to remove an additional column that is created, probably because each row in your datasheet ends with a ; (the parser reads the empty field at the end of each line as data).
The result of print(data.head()) is then as follows:
index cumdist ele ... esttotalpower lat lon
0 49 340 -34.8 ... 9 52.077362 5.114530
1 51 350 -34.8 ... 17 52.077468 5.114543
2 52 360 -35.0 ... -54 52.077521 5.114551
3 53 370 -35.0 ... -173 52.077603 5.114505
4 54 380 -34.8 ... 335 52.077677 5.114387
[5 rows x 28 columns]
To explain the role of the drop command even more, here is what would happen without it (notice the last, weird column)
index cumdist ele ... lat lon Unnamed: 28
0 49 340 -34.8 ... 52.077362 5.114530 NaN
1 51 350 -34.8 ... 52.077468 5.114543 NaN
2 52 360 -35.0 ... 52.077521 5.114551 NaN
3 53 370 -35.0 ... 52.077603 5.114505 NaN
4 54 380 -34.8 ... 52.077677 5.114387 NaN
[5 rows x 29 columns]
I have quite a big problem with reading an .xls file into my machine learning project. The data I need to extract is saved in an .xls file, and I can't find an easy way to load it into a TensorFlow dataset. Can anyone help?
link to this data:
"http://archive.ics.uci.edu/ml/machine-learning-databases/00192/BreastTissue.xls"
Try using the Pandas module:
import pandas as pd
In [24]: df = pd.read_excel(r'D:\download\BreastTissue.xls', sheet_name='Data')
In [25]: df
Out[25]:
Case # Class I0 PA500 HFS DA Area A/DA Max IP DR P
0 1 car 524.794072 0.187448 0.032114 228.800228 6843.598481 29.910803 60.204880 220.737212 556.828334
1 2 car 330.000000 0.226893 0.265290 121.154201 3163.239472 26.109202 69.717361 99.084964 400.225776
2 3 car 551.879287 0.232478 0.063530 264.804935 11888.391827 44.894903 77.793297 253.785300 656.769449
3 4 car 380.000000 0.240855 0.286234 137.640111 5402.171180 39.248524 88.758446 105.198568 493.701814
4 5 car 362.831266 0.200713 0.244346 124.912559 3290.462446 26.342127 69.389389 103.866552 424.796503
5 6 car 389.872978 0.150098 0.097738 118.625814 2475.557078 20.868620 49.757149 107.686164 429.385788
6 7 car 290.455141 0.144164 0.053058 74.635067 1189.545213 15.938154 35.703331 65.541324 330.267293
7 8 car 275.677393 0.153938 0.187797 91.527893 1756.234837 19.187974 39.305183 82.658682 331.588302
8 9 car 470.000000 0.213105 0.225497 184.590057 8185.360837 44.343455 84.482483 164.122511 603.315715
9 10 car 423.000000 0.219562 0.261799 172.371241 6108.106297 35.435762 79.056351 153.172903 558.274515
.. ... ... ... ... ... ... ... ... ... ... ...
96 97 adi 1650.000000 0.047647 0.043284 274.426177 5824.895192 21.225727 81.239571 262.125656 1603.070348
97 98 adi 2800.000000 0.083078 0.184307 583.259257 31388.652882 53.815953 298.582977 501.038494 2896.582483
98 99 adi 2329.840138 0.066148 0.353255 377.253368 25369.039925 67.246689 336.075165 171.387227 2686.435346
99 100 adi 2400.000000 0.084125 0.220610 596.041956 37939.255571 63.651988 261.348175 535.689409 2447.772353
100 101 adi 2000.000000 0.067195 0.124267 330.271646 15381.097687 46.571051 169.197983 283.639564 2063.073212
101 102 adi 2000.000000 0.106989 0.105418 520.222649 40087.920984 77.059161 204.090347 478.517223 2088.648870
102 103 adi 2600.000000 0.200538 0.208043 1063.441427 174480.476218 164.071543 418.687286 977.552367 2664.583623
103 104 adi 1600.000000 0.071908 -0.066323 436.943603 12655.342135 28.963331 103.732704 432.129749 1475.371534
104 105 adi 2300.000000 0.045029 0.136834 185.446044 5086.292497 27.427344 178.691742 49.593290 2480.592151
105 106 adi 2600.000000 0.069988 0.048869 745.474369 39845.773698 53.450226 154.122604 729.368395 2545.419744
[106 rows x 11 columns]
In [26]: df.dtypes
Out[26]:
Case # int64
Class object
I0 float64
PA500 float64
HFS float64
DA float64
Area float64
A/DA float64
Max IP float64
DR float64
P float64
dtype: object
In [27]: df.shape
Out[27]: (106, 11)
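To get from the DataFrame to a TensorFlow dataset, one possible bridge is tf.data.Dataset.from_tensor_slices. A sketch, assuming TensorFlow is installed and that Class is the target column (the batch size is arbitrary):
import pandas as pd
import tensorflow as tf

df = pd.read_excel(r'D:\download\BreastTissue.xls', sheet_name='Data')
# Numeric features plus an integer-encoded class label.
features = df.drop(columns=['Case #', 'Class']).astype('float32')
labels = df['Class'].astype('category').cat.codes.astype('int32')
dataset = tf.data.Dataset.from_tensor_slices((features.values, labels.values))
dataset = dataset.shuffle(len(df)).batch(16)  # assumed batch size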
I have two dataframes, c and h below
c pickle file: http://s000.tinyupload.com/?file_id=64255815375060941529
h pickle file: http://s000.tinyupload.com/?file_id=98284988001290720556
When I write c.append(h) I get TypeError: data type not understood but ONLY if I run pandas 0.17.1. If I run this code in pandas 0.14.1 then the dataframes are appended correctly. What's going on and how can I modify my dataframes to append correctly in 0.17.1?
EDIT: Here are heads for the dataframes
In [49]: h.head(3)
Out[49]:
report_id adv_firm_key manager_id filing_manager_name \
0 45497 105129 20984 Bridgewater Associates, LP
1 45497 105129 20984 Bridgewater Associates, LP
2 45497 105129 20984 Bridgewater Associates, LP
report_period issuer_name cusip position_value quantity \
0 2015-12-31 ABBOTT LABS 002824100 1745000 38857
1 2015-12-31 ACCENTURE PLC IRELAND G1151C101 512000 4900
2 2015-12-31 ADOBE SYS INC 00724F101 9157000 97479
principal_type put_or_call sector total_holding_value \
0 SH X Health Care 7707722000
1 SH X Information Technology 7707722000
2 SH X Information Technology 7707722000
total_holding_value_calculated market_cap shares_float beta symbol \
0 7707722000 66993140300 1488070000 0.924138 ABT
1 7707722000 67773564900 626355000 0.985543 ACN
2 7707722000 46848347700 496787000 1.099186 ADBE
allocation portfolio_value
0 300000 2000000
1 300000 2000000
2 300000 2000000
In [50]: c.head(3)
Out[50]:
put_or_call position_value report_date fund_id report_period \
0 X 10000 2015-11-02 502 2015-12-31
1 X 10000 2015-11-02 502 2015-12-31
2 X 10000 2015-11-02 502 2015-12-31
underlying_id quantity side created_at report_id \
0 1001 5 Short 2016-03-16 17:31:57.003792+00:00 NaN
1 1001 5 Short 2016-03-16 17:31:57.003792+00:00 NaN
2 1001 5 Short 2016-03-16 17:31:57.003792+00:00 NaN
... adv_firm_key filing_manager_name symbol \
0 ... 155680 Davidson Kempner Capital Management LP AAOI
1 ... 155680 Davidson Kempner Capital Management LP AAOI
2 ... 155680 Davidson Kempner Capital Management LP AAOI
sector cusip issuer_name \
0 Telecommunication Services 03823U102 APPLIED OPTOELECTRONICS INC
1 Telecommunication Services 03823U102 APPLIED OPTOELECTRONICSINC COM
2 Telecommunication Services 03823U102 APPLIED OPTOELECTRONICS INC
principal_type market_cap shares_float beta
0 SH 288734200 14566500 1.45758
1 SH 288734200 14566500 1.45758
2 SH 288734200 14566500 1.45758
[3 rows x 21 columns]
Edit 2: Here is a stack trace
In [11]: pd.concat([c,h])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-11-943f474750e7> in <module>()
----> 1 pd.concat([c,h])
/usr/local/miniconda/envs/analytics-env/lib/python2.7/site-packages/pandas/tools/merge.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
833 verify_integrity=verify_integrity,
834 copy=copy)
--> 835 return op.get_result()
836
837
/usr/local/miniconda/envs/analytics-env/lib/python2.7/site-packages/pandas/tools/merge.py in get_result(self)
1023 new_data = concatenate_block_managers(
1024 mgrs_indexers, self.new_axes,
-> 1025 concat_axis=self.axis, copy=self.copy)
1026 if not self.copy:
1027 new_data._consolidate_inplace()
/usr/local/miniconda/envs/analytics-env/lib/python2.7/site-packages/pandas/core/internals.py in concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy)
4472 copy=copy),
4473 placement=placement)
-> 4474 for placement, join_units in concat_plan]
4475
4476 return BlockManager(blocks, axes)
/usr/local/miniconda/envs/analytics-env/lib/python2.7/site-packages/pandas/core/internals.py in concatenate_join_units(join_units, concat_axis, copy)
4569 to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype,
4570 upcasted_na=upcasted_na)
-> 4571 for ju in join_units]
4572
4573 if len(to_concat) == 1:
/usr/local/miniconda/envs/analytics-env/lib/python2.7/site-packages/pandas/core/internals.py in get_reindexed_values(self, empty_dtype, upcasted_na)
4823 if self.is_null and not getattr(self.block, 'is_categorical',
4824 None):
-> 4825 missing_arr = np.empty(self.shape, dtype=empty_dtype)
4826 if np.prod(self.shape):
4827 # NumPy 1.6 workaround: this statement gets strange if all
TypeError: data type not understood
This is bug 11351, where such a concat is not handled properly.
If you try to add the new column created_at, which is missing in h, and then concat:
h['created_at'] = np.nan
new = pd.concat([h,c])
you get the error:
AttributeError: 'numpy.ndarray' object has no attribute 'tz_localize'
One solution is to convert the datetime column to string, concatenate, and convert back:
c['created_at'] = c['created_at'].astype(str)
new = pd.concat([h,c])
new['created_at'] = pd.to_datetime(new['created_at'])
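An alternative sketch, if temporarily dropping the timezone is acceptable (an assumption; the 'UTC' value is taken from the timestamps shown above): strip the tz info before concat and re-localize afterwards.
# Assumes created_at in c is tz-aware; rows from h become NaT after concat.
c['created_at'] = c['created_at'].dt.tz_localize(None)       # drop tz info
new = pd.concat([h, c])
new['created_at'] = new['created_at'].dt.tz_localize('UTC')  # re-attach assumed tz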