Python - Pandas library returns wrong column values after parsing a CSV file

SOLVED Found the solution myself. It turns out that when you want to retrieve specific columns by their names, you should pass the names in the order they appear inside the csv (which is really stupid for a library that is intended to save a developer some parsing time, IMO). Correct me if I am wrong, but I don't see an option to get a specific column's values by its name if the columns are listed in a different order...
I am trying to read a comma-separated value file with Python and then
parse it using the Pandas library. Since the file has many values (columns) that are not needed, I make a list of the column names I do need.
Here's a look at the csv file format.
Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Attendance,Referee,HS,AS,HST,AST,HHW,AHW,HC,AC,HF,AF,HO,AO,HY,AY,HR,AR,HBP,ABP,GBH,GBD,GBA,IWH,IWD,IWA,LBH,LBD,LBA,SBH,SBD,SBA,WHH,WHD,WHA
E0,19/08/00,Charlton,Man City,4,0,H,2,0,H,20043,Rob Harris,17,8,14,4,2,1,6,6,13,12,8,6,1,2,0,0,10,20,2,3,3.2,2.2,2.9,2.7,2.2,3.25,2.75,2.2,3.25,2.88,2.1,3.2,3.1
E0,19/08/00,Chelsea,West Ham,4,2,H,1,0,H,34914,Graham Barber,17,12,10,5,1,0,7,7,19,14,2,3,1,2,0,0,10,20,1.47,3.4,5.2,1.6,3.2,4.2,1.5,3.4,6,1.5,3.6,6,1.44,3.6,6.5
E0,19/08/00,Coventry,Middlesbrough,1,3,A,1,1,D,20624,Barry Knight,6,16,3,9,0,1,8,4,15,21,1,3,5,3,1,0,75,30,2.15,3,3,2.2,2.9,2.7,2.25,3.2,2.75,2.3,3.2,2.75,2.3,3.2,2.62
E0,19/08/00,Derby,Southampton,2,2,D,1,2,A,27223,Andy D'Urso,6,13,4,6,0,0,5,8,11,13,0,2,1,1,0,0,10,10,2,3.1,3.2,1.8,3,3.5,2.2,3.25,2.75,2.05,3.2,3.2,2,3.2,3.2
E0,19/08/00,Leeds,Everton,2,0,H,2,0,H,40010,Dermot Gallagher,17,12,8,6,0,0,6,4,21,20,6,1,1,3,0,0,10,30,1.65,3.3,4.3,1.55,3.3,4.5,1.55,3.5,5,1.57,3.6,5,1.61,3.5,4.5
E0,19/08/00,Leicester,Aston Villa,0,0,D,0,0,D,21455,Mike Riley,5,5,4,3,0,0,5,4,12,12,1,4,2,3,0,0,20,30,2.15,3.1,2.9,2.3,2.9,2.5,2.35,3.2,2.6,2.25,3.25,2.75,2.4,3.25,2.5
E0,19/08/00,Liverpool,Bradford,1,0,H,0,0,D,44183,Paul Durkin,16,3,10,2,0,0,6,1,8,8,5,0,1,1,0,0,10,10,1.25,4.1,7.2,1.25,4.3,8,1.35,4,8,1.36,4,8,1.33,4,8
This list is passed to pandas.read_csv()'s names parameter.
See code.
# Returns an array of the column names needed for our raw data table
def cols_to_extract():
    cols_to_use = [None] * RawDataCols.COUNT
    cols_to_use[RawDataCols.DATE] = 'Date'
    cols_to_use[RawDataCols.HOME_TEAM] = 'HomeTeam'
    cols_to_use[RawDataCols.AWAY_TEAM] = 'AwayTeam'
    cols_to_use[RawDataCols.FTHG] = 'FTHG'
    cols_to_use[RawDataCols.HG] = 'HG'
    cols_to_use[RawDataCols.FTAG] = 'FTAG'
    cols_to_use[RawDataCols.AG] = 'AG'
    cols_to_use[RawDataCols.FTR] = 'FTR'
    cols_to_use[RawDataCols.RES] = 'Res'
    cols_to_use[RawDataCols.HTHG] = 'HTHG'
    cols_to_use[RawDataCols.HTAG] = 'HTAG'
    cols_to_use[RawDataCols.HTR] = 'HTR'
    cols_to_use[RawDataCols.ATTENDANCE] = 'Attendance'
    cols_to_use[RawDataCols.HS] = 'HS'
    cols_to_use[RawDataCols.AS] = 'AS'
    cols_to_use[RawDataCols.HST] = 'HST'
    cols_to_use[RawDataCols.AST] = 'AST'
    cols_to_use[RawDataCols.HHW] = 'HHW'
    cols_to_use[RawDataCols.AHW] = 'AHW'
    cols_to_use[RawDataCols.HC] = 'HC'
    cols_to_use[RawDataCols.AC] = 'AC'
    cols_to_use[RawDataCols.HF] = 'HF'
    cols_to_use[RawDataCols.AF] = 'AF'
    cols_to_use[RawDataCols.HFKC] = 'HFKC'
    cols_to_use[RawDataCols.AFKC] = 'AFKC'
    cols_to_use[RawDataCols.HO] = 'HO'
    cols_to_use[RawDataCols.AO] = 'AO'
    cols_to_use[RawDataCols.HY] = 'HY'
    cols_to_use[RawDataCols.AY] = 'AY'
    cols_to_use[RawDataCols.HR] = 'HR'
    cols_to_use[RawDataCols.AR] = 'AR'
    return cols_to_use
# Extracts raw data from the raw data csv and populates the raw match data table in the database
def extract_raw_data(csv):
    # Clear the database table if it has any logs
    # if MatchRawData.objects.count != 0:
    #     MatchRawData.objects.delete()
    cols_to_use = cols_to_extract()
    # Read and parse the csv file
    parsed_csv = pd.read_csv(csv, delimiter=',', names=cols_to_use, header=0)
    for col in cols_to_use:
        values = parsed_csv[col].values
        for val in values:
            print(str(col) + ' --------> ' + str(val))
Where RawDataCols is an IntEnum.
class RawDataCols(IntEnum):
    DATE = 0
    HOME_TEAM = 1
    AWAY_TEAM = 2
    FTHG = 3
    HG = 4
    FTAG = 5
    AG = 6
    FTR = 7
    RES = 8
    ...
The column names are obtained using it. That part of the code works OK. The correct column name is obtained, but after trying to get its values using
values = parsed_csv[col].values
pandas returns the values of the wrong column. The wrong column's index is around 13 positions away from the one I am trying to get. What am I missing?

You can select columns by name. Just use the following line:
values = parsed_csv[["Column Name", "Column Name2"]]
Or you can select them by index:
cols = [1, 2, 3, 4]
values = parsed_csv[parsed_csv.columns[cols]]
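For completeness: names appears to assign labels positionally, so a list that is shorter than (or ordered differently from) the columns in the file leaves every label on the wrong column; the sample header above has 45 fields while cols_to_extract() builds only 31 names, which lines up with the roughly-13-column offset described in the question. If the goal is to pick specific columns by their header names regardless of order, the usecols parameter of pd.read_csv does exactly that. A minimal sketch, with 'E0.csv' standing in as a placeholder for the real file path:

import pandas as pd

# usecols selects columns by header name; the order of the list does not
# matter, and all other columns are simply skipped
cols_to_use = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'Attendance']
parsed_csv = pd.read_csv('E0.csv', usecols=cols_to_use)  # 'E0.csv' is a placeholder path

for col in cols_to_use:
    print(col, '-------->', parsed_csv[col].values[:3])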

Related

Get data frame in shape of table in word document

I am reading an Excel file, extracting a specific DataFrame, and putting it in a Word document. The issue I face is that the DataFrame loses its shape once added to a paragraph and becomes totally useless.
The complete code is written below.
#importing required libraries
import pandas as pd
import numpy as np
eod = pd.read_excel('df.xlsx')
import datetime
import docx
from datetime import date
legal = docx.Document('legal.docx')
#Calculating No. of days from SCN
eod['SCN Days'] = (pd.Timestamp('now').floor('d') - eod['SCN Date']).dt.days
#Generating list of EFE for Final Showcause Notice to be issued today
FSCN_today = eod.where(eod['SCN Days']>20)
#Dropping Null from generated list
FSCN_today = FSCN_today.dropna(how="all")
FSCN_today = FSCN_today[['Exporter Name','EFE','DESTINATION','VALUE']]
#Getting Unique Values in the list generated
s_values = FSCN_today['Exporter Name'].unique()
#Iterating through List
for c in s_values:
    df1 = FSCN_today[FSCN_today['Exporter Name'] == c]
    legal.paragraphs[7].text = c
    legal.paragraphs[8].text = df1.iloc[10:1]
    legal.paragraphs[15].text = str(df1)
    notice_name = str(c) + ".docx"
    legal.save(notice_name)
#Update Date & Status of FSCN Issued today
eod['FSCN Date'] = np.where((eod['Status']=="SCN ISSUED") & (eod['SCN Days']>20), date.today(), eod['FSCN Date'])
eod['Status'] = np.where((eod['Status']=="SCN ISSUED") & (eod['SCN Days']>20), "FSCN ISSUED", eod['Status'])
#In progress
name = "EOD " + str(date.today()) + ".xlsx"
#eod.to_excel(name, index=False)
The following line has the error:
legal.paragraphs[15].text = str(df1)
You can make this work by creating a table, transferring the DataFrame into that table (as explained in this post), and then placing that table right where legal.paragraphs[15] is at:
#importing required libraries
import pandas as pd
import numpy as np
eod = pd.read_excel('df.xlsx')
import datetime
import docx
from datetime import date
#Calculating No. of days from SCN
eod['SCN Days'] = (pd.Timestamp('now').floor('d') - eod['SCN Date']).dt.days
#Generating list of EFE for Final Showcause Notice to be issued today
FSCN_today = eod.where(eod['SCN Days']>20)
#Dropping Null from generated list
FSCN_today = FSCN_today.dropna(how="all")
FSCN_today = FSCN_today[['Exporter Name','EFE','DESTINATION','VALUE']]
#Getting Unique Values in the list generated
s_values = FSCN_today['Exporter Name'].unique()
#Iterating through List
for c in s_values:
    legal = docx.Document('legal.docx')
    df1 = FSCN_today[FSCN_today['Exporter Name'] == c]
    legal.paragraphs[7].text = c
    legal.paragraphs[8].text = df1.iloc[10:1].iloc
    legal.paragraphs[15].text = ""
    t = legal.add_table(df1.shape[0]+1, df1.shape[1])
    for j in range(df1.shape[-1]):
        t.cell(0, j).text = df1.columns[j]
    for i in range(df1.shape[0]):
        for j in range(df1.shape[-1]):
            t.cell(i+1, j).text = str(df1.values[i, j])
    legal.paragraphs[15]._p.addnext(t._tbl)
    notice_name = str(c) + ".docx"
    legal.save(notice_name)
#Update Date & Status of FSCN Issued today
eod['FSCN Date'] = np.where((eod['Status']=="SCN ISSUED") & (eod['SCN Days']>20), date.today(), eod['FSCN Date'])
eod['Status'] = np.where((eod['Status']=="SCN ISSUED") & (eod['SCN Days']>20), "FSCN ISSUED", eod['Status'])
#In progress
name = "EOD " + str(date.today()) + ".xlsx"
#eod.to_excel(name, index=False)
(I moved the legal = docx.Document('legal.docx') into the loop, as consecutive docx files were keeping the older exporter values.)
I noticed that legal.paragraphs[8].text = df1.iloc[10:1] looked odd. If you change it to legal.paragraphs[8].text = df1[10:1].iloc, the resulting .docx files look more reasonable to me. I do not know what your desired output is, so that's my best guess given what I'm presented with.
I've never worked with python-docx, so I'm pretty sure my attempt is suboptimal, but the following did work with the sample data.
Essentially, I've added a table to the document and inserted the column labels and the content of the DataFrame into the table. There are some nasty parts that I couldn't solve otherwise (the parts where I access the private _ attributes of paragraph and table).
I replaced the following part of your code above
#Iterating through List
for c in s_values:
    df1 = FSCN_today[FSCN_today['Exporter Name'] == c]
    legal.paragraphs[7].text = c
    legal.paragraphs[8].text = df1.iloc[10:1]
    legal.paragraphs[15].text = str(df1)
    notice_name = str(c) + ".docx"
    legal.save(notice_name)
with this (comments for highlighting what I did, line breaks for better readability):
for c in s_values:
    df1 = FSCN_today[FSCN_today['Exporter Name'] == c]
    legal.paragraphs[7].text = c
    legal.paragraphs[8].text = df1[10:1].iloc # <- Changed
    # Add a table with the same number of columns as the DataFrame
    table = legal.add_table(0, len(df1.columns))
    table.autofit = True
    # Create the header line (= column labels of the DataFrame)
    header = table.add_row()
    for col, cell in enumerate(header.cells):
        cell.text = str(df1.columns[col])
    # Insert the content of the DataFrame into the table
    for ind in df1.index:
        row = table.add_row()
        for pos, col in enumerate(df1.columns):
            row.cells[pos].text = str(df1.loc[ind, col])  # str() so non-string cells don't raise
    # Add a break in paragraph 15 (before the table)
    legal.paragraphs[15].add_run().add_break()
    # Add the table to paragraph 15
    legal.paragraphs[15]._p.addnext(table._tbl)
    notice_name = str(c) + ".docx"
    legal.save(notice_name)
    # Remove the table
    table._element.getparent().remove(table._element)
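For reference, the same pattern as a self-contained helper (a consolidation of the answers above, not code from either of them). It assumes only pandas and python-docx; moving the table with the private _p/_tbl attributes follows the answers above, and 'example.docx' is a placeholder name:

import docx
import pandas as pd

def insert_df_as_table(doc, df, paragraph_index):
    # Build a table from the DataFrame: one header row plus one row per record
    table = doc.add_table(rows=1, cols=len(df.columns))
    table.style = 'Table Grid'  # default template style, just so borders are visible
    for j, name in enumerate(df.columns):
        table.cell(0, j).text = str(name)
    for _, row in df.iterrows():
        cells = table.add_row().cells
        for j, value in enumerate(row):
            cells[j].text = str(value)
    # Move the table from the end of the document to just after the target paragraph
    doc.paragraphs[paragraph_index]._p.addnext(table._tbl)

doc = docx.Document()
doc.add_paragraph("Table follows:")
insert_df_as_table(doc, pd.DataFrame({"A": [1, 2], "B": ["x", "y"]}), 0)
doc.save("example.docx")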

Script keeps showing "SettingWithCopyWarning"

Hello, my problem is that my script keeps showing the message below:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
downcast=downcast
I searched Google for a while regarding this, and it seems like my code is somehow assigning a sliced DataFrame to a new variable, which is problematic.
The problem is **I can't find where my code gets problematic**.
I tried the copy function and separated the nested functions, but it is not working.
I attached my code below.
from operator import gt, lt  # assuming gt/lt come from the operator module
import numpy as np

def case_sorting(file_get, col_get, methods_get, operator_get, value_get):
    ops = {">": gt, "<": lt}
    col_get = str(col_get)
    value_get = int(value_get)
    if methods_get == "|x|":  # == rather than is: 'is' compares identity, not value
        new_file = file_get[ops[operator_get](file_get[col_get], value_get)]
    else:
        new_file = file_get[ops[operator_get](file_get[col_get], np.percentile(file_get[col_get], value_get))]
    return new_file
Basically, what I was about to do was make a Flask API that gets an Excel file as input and returns a csv file with some filtering. So I defined some functions first.
def get_brandlist(df_input, brand_input):
    if brand_input == "default":
        final_list = (pd.unique(df_input["브랜드"])).tolist()
    else:
        final_list = brand_input.split("/")
        if '브랜드' in final_list:
            final_list.remove('브랜드')
    final_list = [x for x in final_list if str(x) != 'nan']
    return final_list
Then I defined the main function
def select_bestitem(df_data, brand_name, col_name, methods, operator, value):
    # // 2-1 // remove unnecessary rows and columns with na values
    # (note: axis=0 & 1 evaluates to axis=0, so each axis is dropped in turn)
    df_data = df_data.dropna(axis=0, how='all').dropna(axis=1, how='all')
    df_data.fillna(method='pad', inplace=True)
    # // 2-2 // iterate over all rows to find which row contains the brand value
    default_number = 0
    for row in df_data.itertuples():
        if '브랜드' in row:
            df_data.columns = df_data.iloc[default_number, :]
            break
        else:
            default_number = default_number + 1
    # // 2-3 // create the list containing all the target brand names
    brand_list = get_brandlist(df_input=df_data, brand_input=brand_name)
    # // 2-4 // subset the target brands into another dataframe
    df_data_refined = df_data[df_data.iloc[:, 1].isin(brand_list)]
    # // 2-5 // split the dataframe based on the brand name, and apply the input condition
    df_per_brand = {}
    df_per_brand_modified = {}
    for brand_each in brand_list:
        df_per_brand[brand_each] = df_data_refined[df_data_refined['브랜드'] == brand_each]
        file = df_per_brand[brand_each].copy()
        df_per_brand_modified[brand_each] = case_sorting(file_get=file, col_get=col_name, methods_get=methods,
                                                         operator_get=operator, value_get=value)
    # // 2-6 // merge all the remaining dataframes
    df_merged = pd.DataFrame()
    for brand_each in brand_list:
        df_merged = df_merged.append(df_per_brand_modified[brand_each], ignore_index=True)
    final_df = df_merged.to_csv(index=False, sep=',', encoding='utf-8')
    return final_df
I am going to import this function into my app.py later.
I am quite new to coding, so I'm really sorry if my code is hard to understand, but I just really want to get rid of this annoying warning message. Thanks for the help in advance :)
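One note on the likely cause: this warning usually means a slice of a DataFrame is being mutated in place, for example df_data.fillna(method='pad', inplace=True) when the df_data passed into select_bestitem arrived as a slice of a larger frame. The standard remedy is an explicit copy at the top of the function (df_data = df_data.copy()). A minimal sketch with made-up data:

import pandas as pd

df = pd.DataFrame({'브랜드': ['A', 'A', 'B'], 'value': [1, None, 3]})

# Slicing yields a view-or-copy; mutating it in place triggers the warning:
subset = df[df['브랜드'] == 'A']
# subset.fillna(method='pad', inplace=True)   # <- SettingWithCopyWarning

# An explicit copy owns its data, so the same call is fine:
subset = df[df['브랜드'] == 'A'].copy()
subset.fillna(method='pad', inplace=True)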

Python pandas empty df but columns have elements

I have a really irritating thing in my script and have no idea what's wrong. When I try to filter my DataFrame and then add rows to a new one which I want to export to Excel, this happens:
The file exports as an empty DataFrame, and print also shows me that "report" is empty, but when I try to print report.Name, report.Value etc. I get normal, proper output with elements. Also, I can only export one column to Excel, not the entire DataFrame, which looks empty. What can cause that strange behaviour?
So this is my script:
import pandas as pd

df = pd.read_excel('testfile2.xlsx')
report = pd.DataFrame(columns=['Type','Name','Value'])
for index, row in df.iterrows():
    if type(row[0]) == str:
        type_name = row[0].split(" ")
        if type_name[0] == 'const':
            selected_index = index
            report['Type'].loc[index] = type_name[1]
            report['Name'].loc[index] = type_name[2]
            report['Value'].loc[index] = row[1]
        else:
            for elements in type_name:
                report['Value'].loc[selected_index] += " " + elements
    elif type(row[0]) == float:
        df = df.drop(index=index)
print(report)       # output - Empty DataFrame
print(report.Name)  # output - over 500 elements
You are trying to manipulate a Series that does not exist in the DataFrame, which leads to the described behaviour.
Doing what you did with a much simpler example, I get the same result:
report = pd.DataFrame(columns=['Type','Name','Value'])
report['Type'].loc[0] = "A"
report['Name'].loc[0] = "B"
report['Value'].loc[0] = "C"
print(report) #empty df
print(report.Name) # prints "B" in a series
Easy solution: Just add the whole row instead of the three single values:
report = pd.DataFrame(columns=['Type','Name','Value'])
report.loc[0] = ["A", "B", "C"]
or in your code:
report.loc[index] = [type_name[1], type_name[2], row[1]]
If you want to do it the same way you are doing it at the moment, you first need to add an empty Series at the given index to your DataFrame before you can manipulate it:
report.loc[index] = pd.Series([])
report['Type'].loc[index] = type_name[1]
report['Name'].loc[index] = type_name[2]
report['Value'].loc[index] = row[1]
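As a side note on the design (an addition, not part of the answer above): growing a DataFrame cell by cell or row by row is slow for large inputs. A common idiom is to collect plain rows in a list and build the frame once at the end:

import pandas as pd

rows = []
for type_name, value in [(['const', 'int', 'x'], 1), (['const', 'str', 'y'], 'a')]:
    rows.append([type_name[1], type_name[2], value])  # one plain list per row

# build the DataFrame in a single step
report = pd.DataFrame(rows, columns=['Type', 'Name', 'Value'])
print(report)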

Mapping values into two additional DataFrame columns by an existing one in Python

I am making a generic tool which can take any csv file. The file contains a city column which needs to be geocoded to latitudes and longitudes. I have a csv file which looks something like this; the first row is the column names and the second row is the type of each variable.
Time,M1,M2,M3,CityName
temp,num,num,num,loc
20-May-13,19,20,0,delhi
20-May-13,25,42,7,agra
20-May-13,23,35,4,mumbai
20-May-13,21,32,3,delhi
20-May-13,17,27,1,mumbai
20-May-13,16,40,5,delhi
First of all, I find the unique values in the city column and form a list of them.
filename = 'data_file.csv'
data_date = pd.read_csv(filename)
# .loc replaces the removed .ix accessor
column_name = data_date.loc[:, data_date.loc[0] == "city"]
column_work = column_name.iloc[1:]
column_unique = column_work.iloc[:, 3].unique().tolist()
Secondly, I have written code for geocoding my cities.
def geocode(address):
    i = 0
    try:
        while i < len(geocoders):
            # try to geocode using a service
            location = geocoders[i].geocode(address)
            # if it returns a location
            if location != None:
                # return those values
                return [location.latitude, location.longitude]
            else:
                # otherwise try the next one
                i += 1
    except:
        print(sys.exc_info()[0])
        return ['null', 'null']
    # if all services have failed to geocode, return null values
    return ['null', 'null']
city_list = ['delhi', 'agra', 'mumbai']  # renamed from "list" so the built-in isn't shadowed
j = 0
lat = []
for row in city_list:
    print('processing #', j)
    j += 1
    try:
        state = row
        address = state
        result = geocode(address)
        # add the lat/lon values to the row
        lat.extend(result)
    except:
        # print('Unsuccessful')
        to_print = 'Unsuccessful'
        # row.extend(to_print)
        dout.append(row)
print(lat)
This gives me a list of latitudes and longitudes: [28.7040592, 77.10249019999999, 27.1766701, 78.00807449999999, 19.0759837, 72.8776559]. I want to write this to my CSV file as
Time,M1,M2,M3,CityName,Latitude,Longitude
temp,num,num,num,loc,lat,lng
20-May-13,19,20,0,delhi,28.7040592,77.10249019999999
20-May-13,25,42,7,agra,27.1766701,78.00807449999999
20-May-13,23,35,4,mumbai,19.0759837,72.8776559
20-May-13,21,32,3,delhi,28.7040592,77.10249019999999
20-May-13,17,27,1,mumbai,19.0759837,72.8776559
20-May-13,16,40,5,delhi,28.7040592,77.10249019999999
I tried making separate lists of latitudes and longitudes (latitude = lat[0::2], longitude = lat[1::2]) and converting it into a dictionary {'delhi': [28.7040592, 77.10249019999999], 'agra': [27.1766701, 78.00807449999999], 'mumbai': [19.0759837, 72.8776559]}, but somehow could not figure out how to write it to a csv file.
I think converting them into a dictionary is a good approach.
dic = {'delhi': [28.7040592, 77.10249019999999],
       'agra': [27.1766701, 78.00807449999999],
       'mumbai': [19.0759837, 72.8776559]}
# Create new columns; the default ['lat', 'lng'] covers the type row, whose CityName is 'loc'
data_date["Latitude"] = data_date.apply(lambda row: dic.get(row["CityName"], ['lat', 'lng'])[0], axis=1)
data_date["Longitude"] = data_date.apply(lambda row: dic.get(row["CityName"], ['lat', 'lng'])[1], axis=1)
# Write the data back to the csv file
data_date.to_csv(filename, index=False)
In this way it gets the values for the corresponding city names from the dictionary and writes them into the specified columns. Finally, it overwrites the old csv file with the new data frame.
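A slightly more idiomatic variant of the same idea (an addition, not from the answer) is Series.map, which looks each city up once instead of applying a function to every row:

coords = data_date['CityName'].map(lambda city: dic.get(city, ['lat', 'lng']))
data_date['Latitude'] = coords.str[0]   # first element of each [lat, lon] pair
data_date['Longitude'] = coords.str[1]  # second element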

Iteratively add columns of various length to DataFrame

I have a few categorical columns (descriptions) in my DataFrame df_churn which I'd like to convert to numerical values. And of course I'd like to create a lookup table, because I will need to convert them back eventually.
The problem is that every column has a different number of categories, so appending to df_categories is not easy and I can't think of any simple way to do so.
Here is what I have so far. It stops after the first column, because of the different lengths.
cat_clmn = ['CLI_REGION','CLI_PROVINCE','CLI_ORIGIN','cli_origin2','cli_origin3', 'ONE_PRD_TYPE_1']
df_categories = pd.DataFrame()

def categorizer(_clmn):
    for clmn in cat_clmn:
        dict_cat = {key: value for value, key in enumerate(df_churn[clmn].unique())}
        df_categories[clmn] = dict_cat.values()
        df_categories[clmn + '_key'] = dict_cat.keys()
        df_churn[clmn + '_CAT'] = df_churn[clmn].map(dict_cat)

categorizer(cat_clmn)
There is a temporary solution, but I am sure it can be done in a better way.
df_CLI_REGION = pd.DataFrame()
df_CLI_PROVINCE = pd.DataFrame()
df_CLI_ORIGIN = pd.DataFrame()
df_cli_origin2 = pd.DataFrame()
df_cli_origin3 = pd.DataFrame()
df_ONE_PRD_TYPE_1 = pd.DataFrame()
cat_clmn = ['CLI_REGION','CLI_PROVINCE','CLI_ORIGIN','cli_origin2','cli_origin3', 'ONE_PRD_TYPE_1']
df_lst = [df_CLI_REGION, df_CLI_PROVINCE, df_CLI_ORIGIN, df_cli_origin2, df_cli_origin3, df_ONE_PRD_TYPE_1]

def categorizer(_clmn):
    for clmn, df in zip(cat_clmn, df_lst):
        d = {key: value for value, key in enumerate(df_churn[clmn].unique())}
        df[clmn] = d.values()
        df[clmn + '_key'] = d.keys()
        df_churn[clmn + '_CAT'] = df_churn[clmn].map(d)

categorizer(cat_clmn)
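Since the question asks for a better way: pd.factorize avoids padding lookup tables to a common length, because it returns both the integer codes and the unique values, so each column's lookup can live in a dict of Series of its own length. A sketch, assuming df_churn contains the listed columns:

import pandas as pd

cat_clmn = ['CLI_REGION', 'CLI_PROVINCE', 'CLI_ORIGIN',
            'cli_origin2', 'cli_origin3', 'ONE_PRD_TYPE_1']
lookups = {}  # one lookup Series per column, each with its own length

for clmn in cat_clmn:
    codes, uniques = pd.factorize(df_churn[clmn])
    df_churn[clmn + '_CAT'] = codes
    # index = numeric code, value = original category, for converting back later
    lookups[clmn] = pd.Series(uniques, name=clmn)

# Converting back:
# df_churn[clmn] = df_churn[clmn + '_CAT'].map(lookups[clmn])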
