iterate over pyspark dataframe columns - python

I have the following pyspark.dataframe:
age state name income
21 DC john 30-50K
NaN VA gerry 20-30K
I'm trying to achieve the equivalent of df.isnull().sum() (from pandas) which produces:
age 1
state 0
name 0
income 0
At first I tried something along the lines of:
null_counter = [df[c].isNotNull().count() for c in df.columns]
but this produces the following error:
TypeError: Column is not iterable
Similarly, this is how I'm currently iterating over columns to get the minimum value:
class BaseAnalyzer:
    """Hold the summary statistics of a single DataFrame column.

    Args:
        report: The report object that owns this analyzer; stored unchanged.
        struct: Schema field describing the column. It must expose ``name``
            and ``dataType`` (presumably a ``pyspark.sql.types.StructField``).
    """

    def __init__(self, report, struct):
        self.report = report
        self._struct = struct
        self.name = struct.name
        self.data_type = struct.dataType
        # Statistics start out unset; whoever computes them assigns them later.
        self.min = None
        self.max = None

    def __repr__(self):
        return '<Column: %s>' % self.name
class BaseReport:
    """Build one ``BaseAnalyzer`` per column of *df* and fill in its statistics.

    Args:
        df: DataFrame exposing ``columns``, ``schema.fields`` and ``select``
            (presumably a ``pyspark.sql.DataFrame``).
    """

    def __init__(self, df):
        self.df = df
        self.columns_list = df.columns
        # One analyzer per schema field, keyed by the column name.
        self.columns = {f.name: BaseAnalyzer(self, f) for f in df.schema.fields}

    def calculate_stats(self):
        """Compute each column's minimum with a single ``select`` and store it.

        The result is written to the ``min`` attribute of every analyzer;
        nothing is returned.
        """
        # `fn` is not defined in this snippet -- presumably
        # `pyspark.sql.functions` imported under that alias.
        find_min = self.df.select(
            [fn.min(self.df[c]).alias(c) for c in self.df.columns]
        ).collect()
        min_row = find_min[0]
        for column, min_value in min_row.asDict().items():
            self[column].min = min_value

    def __getitem__(self, name):
        """Return the analyzer registered for column *name*."""
        return self.columns[name]

    def __repr__(self):
        return '<Report>'
report = BaseReport(df)
# calculate_stats() stores its results on the analyzers; it has no return value.
calc = report.calculate_stats()
# Iterate the report created above (the original referenced an undefined `report1`).
for column in report.columns.values():
    if hasattr(column, 'min'):
        print("{}:{}".format(column, column.min))
which allows me to 'iterate over the columns'
<Column: age>:1
<Column: name>: Alan
<Column: state>:ALASKA
<Column: income>:0-1k
I think this method has become way too complicated. How can I properly iterate over ALL columns to provide various summary statistics (min, max, isnull, notnull, etc.)? The distinction between pyspark.sql.Row and pyspark.sql.Column seems strange coming from pandas.

Have you tried something like this:
names = df.schema.names
for name in names:
    # count() returns an int, so convert it before concatenating with a str.
    print(name + ': ' + str(df.where(df[name].isNull()).count()))
You can see how this could be modified to put the information into a dictionary or some other more useful format.

you can try this one :
nullDf= df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])
nullDf.show()
It will give you a single row that contains, for each column, the number of null values in that column.

Related

Python Qt6 QTableView - Iterate Column and Value

I have a QTableView object that is successfully displaying data as expected. When the user clicks on a cell it automatically selects the entire row without any problems. What I would like to do is display the column name and cell value for the selected row (e.g. iterate the row), which will feed into a new QTableView (basically want to transpose the selected row for easier viewing).
I've read through simpler examples and gone through the QT documentation but really struggling to get this working. I can retrieve the cell value but not the column name.
self.tblView = QTableView()
self.tblModel = TableModel(data)
self.tblModel.layoutChanged.emit()
self.tblView.setModel(self.tblModel)
self.tblView.selectRow(0)
self.tblView.clicked.connect(self.qTableViewSingleRowSelection)
def qTableViewSingleRowSelection(self, clickedIndex):
    """Print the clicked index's row and column, then select that whole row.

    Args:
        clickedIndex: Index object of the clicked cell; must provide ``row()``
            and ``column()`` (presumably a ``QModelIndex``).
    """
    row = clickedIndex.row()
    column = clickedIndex.column()
    print("Selected Row: {0}".format(row))
    print("Selected Column: {0}".format(column))
    self.tblView.selectRow(row)
I haven't included the code that didn't work but all I want is to return a dictionary object that contains {'Column 1' : 'Value 1'} etc. for only the selected row. Seems so simple yet really struggling to get this working. Really appreciate any assistance.
You just need to add this
class YourClass():
...
self.dictionary = {}
...
def qTableViewSingleRowSelection(self, clickedIndex):
...
model = self.tblView.model()
value = model.data(model.index(row,column), Qt.DisplayRole)
header = model.headerData(column, Qt.Horizontal, Qt.DisplayRole)
self.dictionary[header] = value
Note that every time you select a cell in a column where you had already selected a cell, the new value overwrites the previous entry for that column.
Example:
Table:
| 1 | 2 | 3 |
| a | b | c |
| d | e | f |
If you select a and then you select e, your dictionary will be {1:'a',2:'e'}, but when you then select d, your dictionary will be {1:'d',2:'e'}.
I'm not sure if this is the best approach but this is how I overcome the problem in the end.
class TableModel(QtCore.QAbstractTableModel):
    """Table model backed by ``self._data`` (indexed via ``.columns`` / ``.iloc``)."""

    def __init__(self, data=None):
        super(TableModel, self).__init__()
        self._data = data

    # (remaining model methods were elided in the original post)
    ...

    def rowRecord(self, index):
        """Return ``(column_label, cell_value)`` for the cell at *index*.

        The column label is converted to ``str``; the value is returned as
        stored in ``self._data``.
        """
        column = str(self._data.columns[index.column()])
        value = self._data.iloc[index.row(), index.column()]
        return column, value
class Metadata(QWidget):
    """Widget that turns the selected table row into a one-column DataFrame."""

    def __init__(self):
        super().__init__()
        # (widget set-up was elided in the original post)
        ...

    def qTableViewSingleRowSelection(self, clickedIndex):
        """Select the clicked row and store its non-blank cells as a DataFrame.

        The result is written to ``self.dataframe_record`` with the column
        names as its index. Cells whose text is empty after stripping are
        left out.
        """
        row = clickedIndex.row()
        self.tblView.selectRow(row)
        columns = self.tblModel.columnCount(None)
        row_dictionary = {}
        for columnIndex in range(columns):
            column, value = self.tblModel.rowRecord(self.tblModel.index(row, columnIndex))
            if str(value).strip() != "":
                row_dictionary[column] = value
        self.dataframe_record = pd.DataFrame.from_dict(row_dictionary, orient='index')

How do I delete repeated elements in a dataframe made by a class-method function?

I have a simple python code for data cleaning. This code imports data from Excel that has the format like this:
product cost used_by prime
name price gender yes or no
name price gender yes or no
... and so on
afterward I will get a mylist using the class function that looks something like this:
mylist = [Item(comic,20.0,male,yes),
Item(paint,14.0,male,no),
Item(pen,5.0,female,nan),
Item(phone case,9.0,nan,no),
Item(headphone,40.0,male,yes),
Item(coat,nan,male,no),
Item(comic,15.0,male,yes),
Item(nan,15.0,male,no)
... and so on]
and after all the filter and cleaning I will get a result that looks like this:
result = [Item(comic,20.0,male,yes),
Item(underwear,15.0,male,yes),
Item(comic,15.0,male,yes)
...
Item(underwear,15.0,male,yes),
...and so on]
Here is the code I got so far:
import os
import pandas as pd
import math
cwd = os.path.abspath('')
files = os.listdir(cwd)
# DataFrame.append was removed in pandas 2.0, so collect every sheet first and
# concatenate once. pd.concat rejects an empty list, hence the fallback.
frames = [pd.read_excel(file) for file in files if file.endswith('.XLSX')]
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Replace missing values with None before converting to plain lists.
df = df.where(df.notnull(), None)
array = df.values.tolist()
print(array)
class Item():
    """One spreadsheet row: product name, cost, target gender and prime flag."""

    def __init__(self, name, cost, gender, prime):
        self.__name = name
        self.__cost = cost
        self.__gender = gender
        self.__prime = prime

    def has_all_properties(self):
        """Return True when no field is missing.

        A cost of ``None`` or NaN counts as missing; the ``None`` check comes
        first because ``math.isnan(None)`` would raise ``TypeError``.
        """
        return bool(
            self.__name
            and self.__cost is not None
            and not math.isnan(self.__cost)
            and self.__gender
            and self.__prime
        )

    def clean(self):
        """Return True for named items costing at most 20, for males, with prime 'yes'."""
        return bool(
            self.__name
            and self.__cost <= 20
            and self.__gender == "male"
            and self.__prime == "yes"
        )

    def __repr__(self):
        return f"Item({self.__name},{self.__cost},{self.__gender},{self.__prime})"

    def __tuple__(self):
        """Return the four fields as a tuple (a plain method, not a Python protocol)."""
        return self.__name, self.__cost, self.__gender, self.__prime
mylist = [Item(*k) for k in array]
filtered = filter(Item.has_all_properties, mylist)
clean = filter(Item.clean, filtered)
result = list(clean)
t_list = [obj.__tuple__() for obj in result]
print(t_list)
output = pd.DataFrame(t_list, columns =['name', 'cost', 'gender', 'prime'])
print(output)
output.to_excel('clean_data.xlsx', index = False, header = True)
In the result, there are two type of repetitive data, one is like the underwear which have two exact same lines; and the other one is like the comic, with different cost values.
So what I want to do is remove one of the line that are exact the same for case one and keep the line that has the smaller cost value for case 2.
So for case two, I am think of reading the product to identify if they are the same and if they are I then compare their cost and keep the one with smaller value. But I am not sure if that is the right way of doing it.
I am aware that using pandas all the way work but I wish to explore the class function and use the customized data frame.
Can you suggest how to do this?
You could use a Set instead of a list, i.e., changing myList = [Item(*k) for k in array] to mySet = {Item(*k) for k in array}.
Sets do not allow duplicates.
EDIT
Make sure to include implementations of __hash__ and __eq__ in your Item class so that set can know how to determine whether an item has a duplicate.
In your case, the __eq__ would look something like the following:
def __eq__(self, other):
    """Return True when all four fields of both items match.

    Meant to be placed inside the ``Item`` class. Objects of an unrelated
    type yield ``NotImplemented`` so Python can try the reflected comparison.
    """
    if not isinstance(other, type(self)):
        return NotImplemented
    return (
        self.__name == other.__name
        and self.__cost == other.__cost
        and self.__gender == other.__gender
        and self.__prime == other.__prime
    )

def __hash__(self):
    """Hash the same fields ``__eq__`` compares, so equal items collapse in a set.

    A class that defines ``__eq__`` without ``__hash__`` becomes unhashable,
    so both are required for use in a set.
    """
    return hash((self.__name, self.__cost, self.__gender, self.__prime))

How to return different type of objects overloading sum function in python?

I have a class Data and I want to overload the __add__ function and get different type of objects based on the type of the objects I sum.
Toy example code
The Data class is as follows:
class Data(object):
    """A tagged pandas Series with a type label.

    Args:
        tag: Free-form label for the data set.
    """

    def __init__(self, tag=""):
        self.type = ""
        self.tag = tag
        self.df = pd.Series()

    def __str__(self):
        s = "Data type " + self.type + ": " + self.tag
        return s

    def createRandomData(self, amountData=10, sizeData=100):
        """Replace ``self.df`` with *amountData* random integers in ``[0, sizeData)``."""
        self.df = pd.DataFrame(np.random.randint(0, sizeData, size=(amountData, 1)))[0]

    def __add__(self, other):
        """Return a new object holding the element-wise sum and the joined tags.

        The result is always a plain ``Data``, whatever the operand types are.
        """
        data = Data(self.tag + "+" + other.tag)
        data.df = self.df + other.df
        return data
I also have two different objects DataTypeA and DataTypeB which inherit from Data.
class DataTypeA(Data):
    """``Data`` whose type label is "A"."""

    def __init__(self, tag=""):
        Data.__init__(self, tag)
        self.type = "A"
class DataTypeB(Data):
    """``Data`` whose type label is "B"."""

    def __init__(self, tag=""):
        Data.__init__(self, tag)
        self.type = "B"
Expected result
So I would like to ensure that when I sum two objects from same type of Data (e.g.: DataTypeA+ DataTypeA) the result is also of that type of Data. But when two objects are of different types (e.g.: DataTypeA+ DataTypeB) the result should be a Data object.
The result for the sum should be the sum of the Series in df and the tag should be the tag of the first object concatenated with "+" and finally the second tag.
Example
By now if I run following code:
a = DataTypeA("data1")
a.createRandomData()
b = DataTypeB("data2")
b.createRandomData()
a2 = a+a
b2 = b+b
c = a+b
print a
print b
print a2
print b2
print c
I get as result:
>>>
Data type A: data1
Data type B: data2
Data type : data1+data1
Data type : data2+data2
Data type : data1+data2
But a2 (data1+data1) should be of type DataTypeA instead Data, and b2 (data2+data2) should be of type DataTypeB instead of Data.
Question
How to rewrite this code to achieve this and which would be the best way? Should I rewrite the __add__ method for Data object or overload __add__ method for DataTypeA and DataTypeB instead?
One solution would be to overload the Data object __add__ method as follows:
class Data(object):
    """A tagged pandas Series with a type label.

    Args:
        tag: Free-form label for the data set.
    """

    def __init__(self, tag=""):
        self.type = ""
        self.tag = tag
        self.df = pd.Series()

    def __str__(self):
        s = "Data type " + self.type + ": " + self.tag
        return s

    def createRandomData(self, amountData=10, sizeData=100):
        """Replace ``self.df`` with *amountData* random integers in ``[0, sizeData)``."""
        self.df = pd.DataFrame(np.random.randint(0, sizeData, size=(amountData, 1)))[0]

    def __add__(self, other):
        """Return the element-wise sum, keeping the operands' type when they agree.

        Two operands of the same class produce an object of that class; mixed
        operands produce a plain ``Data``. The class is taken from the operand
        instead of being listed here, so the base class does not need to know
        its subclasses. A subclass must accept the tag as its first
        constructor argument.
        """
        result_cls = type(self) if type(self) is type(other) else Data
        data = result_cls(self.tag + "+" + other.tag)
        data.df = self.df + other.df
        return data
The rest of the code would remain the same.
So if you run the same example code mentioned in the question the output would be:
>>>
Data type A: data1
Data type B: data2
Data type A: data1+data1
Data type B: data2+data2
Data type : data1+data2

Separate row of a dataframe based on a condition by renaming it in pandas / python

For each of my id column, I want to check for each row the column job and rename the id rows below if a specific job appears.
In this example below, if the job seller appears or barber, the rows below must be renamed.
Do you have a clue ?
entry = pd.DataFrame([['1','35','fireman'],['2','35','policeman'],['3','35','seller'],['4','35','seller'],['5','35','accountant'],['6','35','barber'],['7','35','fireman']],columns=['index','id','job'])
output = pd.DataFrame([['1','35','fireman'],['2','35','policeman'],['3','35','seller'],['4','35','seller'],['5','35(1)','accountant'],['6','35(1)','barber'],['7','35(2)','fireman']],columns=['index','id','job'])
Okay, this is the new answer.
class id_numbering:
    """Mutable counter shared across successive row evaluations."""

    def __init__(self):
        self.number = 0

    def add_one(self):
        """Increment the counter and return the new value."""
        self.number += 1
        return self.number

    def add_zero(self):
        """Return the current value without changing it."""
        return self.number
def new_id(x, number, job_to_match):
    """Return the (possibly renumbered) id for one row.

    Args:
        x: Row with the keys ``'job'``, ``'id'`` and ``'match_job'``.
        number: Counter object providing ``add_one()`` and ``add_zero()``.
        job_to_match: Collection of job names that trigger a renumbering.

    Returns:
        The plain id while the counter is 0, otherwise ``"<id>(<counter>)"``.
    """
    job = x['job']
    id_ = x['id']
    is_match_job = x['match_job']
    # The counter advances only when `match_job` is exactly True and the row's
    # own job is not a matching job. The identity test also rejects NaN.
    if (is_match_job is True) and (job not in job_to_match):
        suffix = number.add_one()
    else:
        suffix = number.add_zero()
    if suffix == 0:
        return id_
    return "{id_}({number})".format(id_=id_, number=suffix)
job_to_match = ["seller", "barber"]
entry['match_job'] = entry['job'].map(lambda x: True if x in job_to_match else False)
entry['match_job'] = entry['match_job'].shift(1)
number = id_numbering()
entry["new_id"] = entry.apply(lambda x: new_id(x, number, job_to_match), axis=1)
output = entry.drop("match_job", axis=1)

tables in python - having some errors - not sure why

I have some questions about "technical" and basic functions in python.
I have a table like this:
Name,Gender,Age,Salary,Height
Menny, M, 1, 1, 1
James, J, 2, 2, 2
Sami, S, 3, 3, 3
class Table:
    """Validate a separator-delimited text file as a table.

    Args:
        path: Path of the text file to read.
        sep: Column separator.

    Raises:
        ValueError: On duplicate column names, duplicate row names, or a row
            whose length differs from the header's.
    """

    def __init__(self,path,sep):
        try:
            with open(path, "r") as f:
                read_file = f.read()
        except (IOError, OSError, UnicodeDecodeError):
            print("cannot create a table from this file")
            return
        # NOTE: `table` is a local variable; nothing is stored on the instance.
        table = [line.split(sep) for line in read_file.split("\n")]
        if len(table) > 0:
            # Column names must be unique.
            for i in range(len(table[0])):
                if table[0][i] in table[0][0:i]:
                    raise ValueError
            row_names = []
            for i in range(1,len(table)):
                # Every row needs as many fields as the header, plus a unique name.
                if len(table[i]) != len(table[0]):
                    raise ValueError
                if table[i][0] in row_names:
                    raise ValueError
                row_names.append(table[i][0])
Now I want to use functions:
1. to know how many cells there are. here I have 12 cells. The height of the table is len(table). Then the width is len(table[0]). The number of cells is height*width.
so:
def len(self):
height = len(table)
width = len(table[0])
return height * width
and if I tried this:
def len(self):
len(self.nestedList)*len(self.nestedList[0])
I get "None"
If in the shell I write the name Menny, Sami etc, then print the rest of the line (age, salary etc)....
So I thought about it:
def the_row (self, rowname):
rows_checking = []
for i in range(1, len(table)):
rows_checking.append(table[i])
if rowname in rows_checking:
table[i].remove(table[0:0])
return table[i]
almost the same thing like in the second task, but this time the function will print the value that is common to 2 thing. For example - the code will print "1" if I write Menny and Age.
Again, I think I'll do it almost the same as I did in the pre task, but this time:
get_the_value(self,rowname,colname)
So far seems to be good ideas, I hope so...
but I get errors:
AttributeError: Table instance has no attribute '__len__'
or
AttributeError: Table instance has no attribute 'len'
Probably because I didn't used "self" here, right? So what I can do?
You don't have to feed me by the spoon and tell me the codes as it should be, but just give me advices as possible as you can, please.
edited code:
class Table:
def __init__(self,path,sep):
self.path=path
self.sep=sep
self.g=[]
self.count=0
self.headlines=[]
self.matrix=[]
self.headrows=[]
self.postionrow=0
self.postioncolmn=0
try:
f=open(self.path,'r')
read_file=f.read()
split_file=read_file.split()
for line in split_file:
list_the_line=line.split(self.sep)
self.g.append(list_the_line)
self.count=0
for z in range (len(self.g[0])):
self.count=0
for d in range(len(self.g[0])):
if self.g[0][z]==self.g[0][d]:
self.count+=1
if self.count>=2:
raise ValueError
num_first_line=len(self.g[0])
for k in range (len(self.g)):
if len(self.g[k])!= num_first_line:
raise ValueError
self.headlines=self.g[0]
self.g.remove(self.g[0])
self.count=0
for row_name1 in range (len(self.g)):
self.count=0
for row_name2 in range(len(self.g)):
if self.g[row_name1][0]==self.g[row_name2][0]:
self.count+=1
if self.count>=2:
raise ValueError
for i in range (len(self.g)):
self.headrows.append(self.g[i][0])
self.g[i].remove(self.g[i][0])
ezer=[]
for op in range (len(self.g)):
ezer=[]
for od in range (len(self.g[0])):
ezer.append(self.g[od][op])
self.matrix.append(ezer)
f.close()
except :
print "cannot creat a table object from this file"
return
def len(self):
num_rows=len(self.g)
num_cols=len(self.g[0])
return num_rows*num_cols
def get_row(self,rowname):
for i in range (len(self.headlines)):
if rowname==self.headrows[i]:
self.postionrow=i
return self.g[i]
if not rowname in self.headrows :
raise ValueError
def get_column(self,colname):
for i in range (len(self.headlines)):
if colname==self.headlines[i]:
self.postioncolmn=i-1
return self.matrix[i-1]
if not colname in self.headlines :
raise ValueError
def get_value(self,rowname,colname):
self.get_row(rowname)
self.get_column(colname)
if not rowname in self.headrows :
raise ValueError
if not colname in self.headlines :
raise ValueError
return self.g[self.postionrow][self.postioncolmn]
def get_row_name_with_max_value(self,colname):
if not colname in self.headlines :
raise ValueError
max_colmn=max(self.get_column(colname))
for i in range (len(self.matrix)):
if max_colmn == self.g[i][self.postioncolmn]:
return self.headrows[i]
and what should be the result:
>>> table = Table("table_examp1111111","\t")
cannot create a table from this file
>>> table = Table("table_example1.txt","\t")
>>> print table.len()
12
>>> print table.get_row("Menny")
['M', '1', '1', '1']
>>> print table.get_column("Height")
['1', '2', '3']
>>> print table.get_value("Sami","Age")
3
>>> print table.get_row_name_with_max_value("Height")
Sami
>>> print table.get_row_name_with_max_value("Salary")
Sami
This code works, but I want to make it more Pythonic. Please don't change its structure and don't add or remove functions; just fix my syntax.
Thanks.
Whenever you call the built-in function len() on an object, Python tries to call that object's __len__ method. So if you define it like this, it should work:
def __len__(self):
    """Return the number of cells in ``self.table`` (rows times columns).

    The width is taken from the first row. An empty table has 0 cells.
    """
    if not self.table:
        return 0
    height = len(self.table)
    width = len(self.table[0])
    return height * width
You are trying to call __len__ on the Table class, whereas it looks like you should be calling len() on the table list that you build in the constructor.
You should create an attribute self.table, and then either use the len function on that, or
def numOfCells(self):
    """Return rows times columns of ``self.table``; 0 when the table is empty."""
    if not self.table:
        return 0
    return len(self.table) * len(self.table[0])
This looks like a perfect place to use the csv module:
import csv
def load_csv(fname, **kwargs):
    """Read the CSV file *fname* and return its rows as a list of lists.

    Args:
        fname: Path of the file to read.
        **kwargs: Passed on to ``csv.reader`` (for example ``delimiter``).
    """
    # On Python 3, csv.reader needs a text-mode file opened with newline=''.
    # The original binary mode ('rb') makes it fail on the first row.
    with open(fname, newline='') as inf:
        in_csv = csv.reader(inf, **kwargs)
        return list(in_csv)
class Table:
    """CSV-backed table with a header row and a unique label in each row's first cell.

    Args:
        path: Path of the CSV file.
        sep: Field delimiter.

    Raises:
        ValueError: If the file has no rows, a row's length differs from the
            header's, or two rows share a label.
    """

    def __init__(self, path, sep=','):
        self.table = load_csv(path, delimiter=sep)
        if not self.table:
            raise ValueError('No data in file {}'.format(path))
        self.header = self.table.pop(0)
        self.cols = len(self.header)
        # Maps each row label to its 0-based position in self.table.
        self.labels = {}
        for i, row in enumerate(self.table, 1):
            # make sure rows are all same length
            if len(row) != self.cols:
                raise ValueError('row {} contains {} items - should be {}'.format(i, len(row), self.cols))
            # make sure rows-labels are unique
            lbl = row[0]
            if lbl in self.labels:
                # Stored positions are 0-based; report both rows 1-based.
                raise ValueError('rows {} and {} - duplicate labels'.format(self.labels[lbl] + 1, i))
            self.labels[lbl] = i - 1

    @property
    def rows(self):
        """Number of data rows (the header is not counted)."""
        return len(self.table)

    @property
    def cells(self):
        """Number of data cells, leaving out the row labels."""
        return self.rows * (self.cols - 1)  # omit row labels

    def get_row_by_label(self, lbl):
        """Return the whole row (label included) whose first cell is *lbl*."""
        return self.table[self.labels[lbl]]

    def get_item(self, lbl, attr):
        """Return the cell in row *lbl* under the header column *attr*."""
        ndx = self.header.index(attr)
        return self.get_row_by_label(lbl)[ndx]
def main():
    """Load 'tbl.csv' and print its cell count, one row and one cell."""
    t = Table('tbl.csv')
    print(t.cells)
    print(t.get_row_by_label("Menny"))
    print(t.get_item("Menny", "Age"))


if __name__=="__main__":
    main()
EDIT:
Ok, this is for your FIRST question. From what I understand, you are wanting a function that will return the number of cells in your table. This number does not include the names of people in the rows, and does not include the first row at all. If I understand correctly, then this should work:
If table is:
Name,Gender,Age,Salary,Height
Menny, M, 1, 1, 1
James, J, 2, 2, 2
Sami, S, 3, 3, 3
Then number of cells is '12'... so:
Example:
class Table:
    """Separator-delimited text table kept as a list of rows in ``self.table``.

    Args:
        path: Path of the text file to read.
        sep: Column separator.

    Raises:
        ValueError: On duplicate column names, duplicate row names, or a row
            whose length differs from the header's.
    """

    def __init__(self, path, sep):
        # Set the attribute up front so the object stays usable after a failed read.
        self.table = []
        try:
            with open(path) as f:
                read_file = f.read()  # will auto close the file after the read
        except (IOError, OSError, UnicodeDecodeError):
            print("cannot create a table from this file")
            return
        # splitlines() avoids the spurious empty last row that split('\n')
        # produces when the file ends with a newline.
        self.table = [line.split(sep) for line in read_file.splitlines()]
        if self.table:
            header = self.table[0]
            # Column names must be unique.
            if len(set(header)) != len(header):
                raise ValueError
            row_names = set()
            for row in self.table[1:]:
                # Every row needs as many fields as the header, plus a unique name.
                if len(row) != len(header):
                    raise ValueError
                if row[0] in row_names:
                    raise ValueError
                row_names.add(row[0])

    # now a function that could return the table length
    def get_num_cells(self):
        """Return the number of data cells, excluding the header row and the name column.

        Returns 0 when no table was loaded.
        """
        if not self.table:
            return 0
        return (len(self.table) - 1) * (len(self.table[0]) - 1)
Using self.table will make this easier, as you don't have to include it in the other function args, as above in get_num_cells, I just used self.table without putting it in the args of the function.
To call this function you would do the following:
app = Table(path, sep)
app.get_num_cells()
# this code would be placed at the end of your file, and not inside the class
Example:
class Table()
__init__(self, path, sep):
etc.
etc.etc.etc.
# now the code to create an instance of Table and call a method here like this
app = Table(path, sep) # path would be a filepath "C:/etc./file.txt", and sep = "," etc.
app.get_num_cells()
For your other questions, I am not entirely sure what you want yet, but if you write again in the comments for this, I will try. Please let me know if this works for you.

Categories

Resources