How can I do a windowed query on a multi-column primary key? - python
This is based on the example found here, but I guess I'm not understanding it. It works for single-column primary keys but fails on multi-column ones.
This is my code:
# Requires: from sqlalchemy import and_, func, Column

@classmethod
def column_windows(cls, q, columns, windowsize, where=None):
    """Return a series of WHERE clauses against
    a given column that break it into windows.

    Result is an iterable of tuples, consisting of
    ((start, end), whereclause), where (start, end) are the ids.

    Requires a database that supports window functions,
    e.g. PostgreSQL, SQL Server, Oracle.

    Enhance this yourself!  Add a "where" argument
    so that windows of just a subset of rows can
    be computed.
    """
    # Here is the thing... how to compare when `columns` is a set of columns?
    def int_for_range(start_id, end_id):
        # These comparisons only make sense for a single column; this is where
        # it breaks down for a multi-column primary key.
        if end_id:
            return and_(
                columns >= start_id,
                columns < end_id
            )
        else:
            return columns >= start_id

    if isinstance(columns, Column):
        columns_k = (columns,)
    else:
        columns_k = tuple(columns)

    q2 = None
    cols = ()
    for c in columns:
        cols = cols + (c,)
        if not q2:
            q2 = q.session.query(c)
        else:
            q2 = q2.add_column(c)

    q2 = q2.add_column(func.row_number().over(order_by=columns_k).label('rownum'))
    q2 = q2.filter(q._criterion).from_self(cols)
    if windowsize > 1:
        # keep only every windowsize-th row number (newer SQLAlchemy needs text() here)
        q2 = q2.filter("rownum %% %d=1" % windowsize)

    for res in q2:
        print(res)

    intervals = [id for id, in q2]
    while intervals:
        start = intervals.pop(0)
        if intervals:
            end = intervals[0]
        else:
            end = None
        yield int_for_range(start, end)

@classmethod
def windowed_query(cls, q, columns, windowsize):
    """Break a Query into windows on a given column."""
    for whereclause in cls.column_windows(q, columns, windowsize):
        for row in q.filter(whereclause).order_by(columns):
            yield row
The problem is comparing the set of columns that make up the primary key: the columns >= start_id comparisons above only work for a single column. I guess some kind of recursive clause-generating function should do it... Let's try it...
Well, the result is not what I expected, but I got it to work. It now really windows any query while keeping everything in place: multi-column unique ordering and so on.
Here is my code; I hope it may be useful for someone else:
# Requires: from sqlalchemy import and_, func
#           from sqlalchemy.sql import expression

@classmethod
def window_query(cls, q, windowsize, windows=None):
    """
    q          = Query object whose results we want to window
    windowsize = the number of elements each window has
    windows    = the window number, or list of window numbers, to query (1-based)
    """
    windowselect = False
    if windows:
        if not isinstance(windows, list):
            windows = [windows]   # a single window number was passed
        windowselect = True

    # Appending u_columns to the ordered counting subquery ensures unique ordering
    u_columns = [col for col in cls.getBestUniqueColumns()]
    # o_columns is the list of ORDER BY columns of the query
    o_columns = [col for col in q._order_by]
    # Append the columns from u_columns that are not already in o_columns, so the
    # ordering stays as requested but becomes unique
    sq_o_columns = list(o_columns)
    for col in u_columns:
        if col not in sq_o_columns:
            sq_o_columns.append(col)

    sub = None
    # Select the unique columns in the subquery; we need them to join back to the parent query
    for col in u_columns:
        if not sub:
            sub = q.session.query(col)
        else:
            sub = sub.add_column(col)

    # Build a tuple from the sq_o_columns list (I don't know why over() won't accept
    # the list itself; TODO: more elegant)
    sq_o_col_tuple = ()
    for col in sq_o_columns:
        sq_o_col_tuple = sq_o_col_tuple + (col,)

    # Add the row-counting column, numbering over the combined ordering + unique columns
    sub = sub.add_column(func.row_number().over(order_by=sq_o_col_tuple).label('rownum')).filter(q._criterion)
    # Turn it into an actual subquery
    sub = sub.subquery('lacrn')

    # Prepare the join's ON clause expression, comparing the unique columns defined by u_columns
    joinclause = expression.BooleanClauseList()
    for col in u_columns:
        joinclause = joinclause.__and__(col == sub.c[col.key])
    # Do the join
    q = q.join(sub, joinclause)

    i = -1
    while True:
        # We query the windows defined by the windows list, or all of them
        if windowselect:
            # We want only the selected windows' results to be returned
            if windows:
                i = windows.pop(0) - 1
            else:
                break
        else:
            # We want all windows' results to be returned
            i = i + 1
        res = q.filter(and_(sub.c.rownum > (i * windowsize),
                            sub.c.rownum <= ((i + 1) * windowsize))).all()
        if not (res or windowselect):
            # In an all-windows query we stop when there are no more results.  We must
            # check windowselect here because selected windows may not exist and may be
            # unordered.  E.g. [1, 2, 9999999999999, 3]: assuming the third requested
            # window has no results, we still return windows 1, 2 and 3.
            break
        for row in res:
            yield row
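For completeness, here is a minimal sketch of how I call it. The Invoice model, its columns, and the session are hypothetical stand-ins; any mapped class that mixes in window_query() and provides getBestUniqueColumns() works the same way:

# Hypothetical usage sketch.  Invoice is an assumed mapped class providing
# window_query() and getBestUniqueColumns() (returning its primary key columns);
# session is a regular SQLAlchemy Session.
q = (session.query(Invoice)
            .filter(Invoice.status == 'open')     # window_query() reads q._criterion
            .order_by(Invoice.created_at))        # and q._order_by, so both must be set

# Stream all rows, 500 per window
for row in Invoice.window_query(q, windowsize=500):
    print(row)

# Or fetch only windows ("pages") 1 and 3
for row in Invoice.window_query(q, windowsize=500, windows=[1, 3]):
    print(row)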
Related
if isinstance(x, list) appending new values from the list overwrites the previous one
I'm absolutely new to Python, trying to automate some of my stuff. I'm currently trying to find a way to build a dictionary per table row based on the columns of the table itself.

cursor.execute(query)
columns = [col[0] for col in cursor.description]
rows = [dict(zip(columns, row)) for row in cursor.fetchall()]
results = []
for row in rows:
    arg1 = row['arg1']
    arg2 = row['arg2']
    temp = row
    temp['concat'] = (str(arg1) + str(arg2))
    concat = row['concat']

    def map_sth(x):
        return {
            'arg1arg2': ["abcd", "efgh"],
            'arg1arg2': "xyz",
        }[str(x)]

    mapped = map_sth(concat)
    if isinstance(mapped, list):
        for mapping in mapped:
            temp['new_column'] = mapping
            results.append(temp)
    else:
        temp['new_column'] = mapped
        results.append(temp)

df = pandas.DataFrame(results)
df.to_csv("file.csv", index=False)

I debugged the code and it works fine for results with only one item in map_sth:

if isinstance(mapped, list):
    for mapping in mapped:
        temp['new_column'] = mapping
        results.append(temp)
else:
    temp['new_column'] = mapped
    results.append(temp)

Once it gets into the isinstance branch it gives me the correct value on the first loop iteration, but once it enters the second iteration it overwrites both values with the second phrase from map_sth. Any help would be much appreciated as I'm currently stuck :/ Thanks!
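A likely cause (a minimal sketch, assuming results.append(temp) really sits inside the inner loop as shown): temp = row does not copy the dict, so both appended entries reference the same object and the last assignment wins; copying the row per iteration avoids that.

# Sketch of the aliasing issue with made-up data.
row = {'arg1': 'a', 'arg2': 'b'}

results = []
for mapping in ["abcd", "efgh"]:
    temp = row                 # same dict object on every iteration
    temp['new_column'] = mapping
    results.append(temp)
print(results)                 # both entries show 'efgh'

results = []
for mapping in ["abcd", "efgh"]:
    temp = dict(row)           # independent copy per iteration
    temp['new_column'] = mapping
    results.append(temp)
print(results)                 # 'abcd' and 'efgh' as expected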
python cut between partitioned column results
I use the code below in Spark/Scala to get the partitioned columns.

scala> val part_cols = spark.sql(" describe extended work.quality_stat ").select("col_name").as[String].collect()
part_cols: Array[String] = Array(x_bar, p1, p5, p50, p90, p95, p99, x_id, y_id, # Partition Information, # col_name, x_id, y_id, "", # Detailed Table Information, Database, Table, Owner, Created Time, Last Access, Created By, Type, Provider, Table Properties, Location, Serde Library, InputFormat, OutputFormat, Storage Properties, Partition Provider)

scala> part_cols.takeWhile( x => x.length() != 0 ).reverse.takeWhile( x => x != "# col_name" )
res20: Array[String] = Array(x_id, y_id)

I need to get similar output in Python. I'm struggling to replicate the same array operation in Python to get [y_id, x_id]. Below is what I tried.

>>> part_cols = spark.sql(" describe extended work.quality_stat ").select("col_name").collect()

Is it possible using Python?
part_cols in the question is an array of rows, so the first step is to convert it into an array of strings:

part_cols = spark.sql(...).select("col_name").collect()
part_cols = [row['col_name'] for row in part_cols]

Now the start and end of the part of the array you are interested in can be calculated with

start_index = part_cols.index("# col_name") + 1
end_index = part_cols.index('', start_index)

Finally, a slice can be extracted from the list with these two values as start and end:

part_cols[start_index:end_index]

This slice will contain the values ['x_id', 'y_id']. If the output really should be reversed, the slice

part_cols[end_index-1:start_index-1:-1]

will contain the values ['y_id', 'x_id'].
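Put together as one runnable sketch (the table name and the exact DESCRIBE output layout are taken from the question and may differ in other setups):

# Sketch: extract partition column names from DESCRIBE EXTENDED output.
rows = spark.sql("describe extended work.quality_stat").select("col_name").collect()
col_names = [row['col_name'] for row in rows]

start_index = col_names.index("# col_name") + 1    # partition columns start after this marker
end_index = col_names.index('', start_index)       # and end at the blank entry
partition_cols = col_names[start_index:end_index]  # ['x_id', 'y_id']
print(list(reversed(partition_cols)))              # ['y_id', 'x_id']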
Is it possible to update a row of data using the position of a column (e.g. like a list index) in Python / SQLAlchemy?
I am trying to compare two rows of data to one another, which I have stored in a list.

for x in range(0, len_data_row):
    if company_data[0][0][x] == company_data[1][0][x]:
        print('MATCH 1: {} - {}'.format(x, company_data[0][0][x]))
        # do nothing
    if company_data[0][0][x] == None and company_data[1][0][x] != None:
        print('MATCH 2: {} - {}'.format(x, company_data[1][0][x]))
        # update first company_id with data from 2nd
    if company_data[0][0][x] != None and company_data[1][0][x] == None:
        print('MATCH 3: {} - {}'.format(x, company_data[0][0][x]))
        # update second company_id with data from 1st

Pseudocode of what I want to do: if the data at index [x] is not None for row 2 but is blank for row 1, then write the value of row 2 at index [x] into row 1's data in my database.

The part I can't figure out is whether SQLAlchemy lets you specify which column is being updated by an "index" (I think in db-land "index" means something different from what I mean; I mean a list index, e.g. list[1]), and also whether you can dynamically specify which column is being updated by passing a variable to the update code. Here's what I'm looking to do (it doesn't work, of course):

def some_name(column_by_index, column_value):
    u = table_name.update().where(table_name.c.id == row_id).values(column_by_index=column_value)
    db.execute(u)

Thank you!
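For reference, a Core Table's column collection can be indexed by position, and update().values() accepts a dict keyed by Column objects, so something along these lines should work. This is only a sketch: table_name, row_id and db are assumed to exist as in the question.

# Sketch: update a column chosen by positional index rather than by name.
def update_by_position(column_by_index, column_value):
    col = list(table_name.c)[column_by_index]          # pick the column by its position
    u = (table_name.update()
                   .where(table_name.c.id == row_id)
                   .values({col: column_value}))       # values() accepts {Column: value}
    db.execute(u)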
Counting the repeated values in one column based on another column
Using pandas, I am dealing with the following CSV data:

f,f,f,f,f,t,f,f,f,t,f,t,g,f,n,f,f,t,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,t,t,nowin
t,f,f,f,f,f,f,f,f,f,t,f,g,f,b,f,f,t,f,f,f,f,f,t,f,t,f,f,f,f,f,f,f,t,f,n,won
t,f,f,f,t,f,f,f,t,f,t,f,g,f,b,f,f,t,f,f,f,t,f,t,f,t,f,f,f,f,f,f,f,t,f,n,won
f,f,f,f,f,f,f,f,f,f,t,f,g,f,b,f,f,t,f,f,f,f,f,t,f,t,f,f,f,f,f,f,f,t,f,n,nowin
t,f,f,f,t,f,f,f,t,f,t,f,g,f,b,f,f,t,f,f,f,t,f,t,f,t,f,f,f,f,f,f,f,t,f,n,won
f,f,f,f,f,f,f,f,f,f,t,f,g,f,b,f,f,t,f,f,f,f,f,t,f,t,f,f,f,f,f,f,f,t,f,n,win

For this part of the raw data, I was trying to return something like:

Column1_name -- t -- count of nowin = 0
Column1_name -- t -- count of won = 3
Column1_name -- f -- count of nowin = 2
Column1_name -- f -- count of won = 1

Based on this idea, get dataframe row count based on conditions, I was thinking of doing something like this:

print(df[df.target == 'won'].count())

However, this always returns the same number of "won"s, based only on the last column, without taking into consideration whether the first column is an "f" or a "t". In other words, I was hoping to use something from the pandas DataFrame API that reproduces the idea of a GROUP BY from SQL, grouping on, for example, the first and last columns. Should I keep pursuing this idea, or should I simply start using for loops? If you need it, the rest of my code:

import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.data"
df = pd.read_csv(url, names=[
    'bkblk','bknwy','bkon8','bkona','bkspr','bkxbq','bkxcr','bkxwp','blxwp','bxqsq','cntxt','dsopp','dwipd',
    'hdchk','katri','mulch','qxmsq','r2ar8','reskd','reskr','rimmx','rkxwp','rxmsq','simpl','skach','skewr',
    'skrxp','spcop','stlmt','thrsk','wkcti','wkna8','wknck','wkovl','wkpos','wtoeg','target'
])

features = ['bkblk','bknwy','bkon8','bkona','bkspr','bkxbq','bkxcr','bkxwp','blxwp','bxqsq','cntxt','dsopp','dwipd',
            'hdchk','katri','mulch','qxmsq','r2ar8','reskd','reskr','rimmx','rkxwp','rxmsq','simpl','skach','skewr',
            'skrxp','spcop','stlmt','thrsk','wkcti','wkna8','wknck','wkovl','wkpos','wtoeg','target']

# number of lines
#tot_of_records = np.size(my_data,0)
#tot_of_records = np.unique(my_data[:,1])
#for item in my_data:
#    item[:,0]

num_of_won = 0
num_of_nowin = 0
for item in df.target:
    if item == 'won':
        num_of_won = num_of_won + 1
    else:
        num_of_nowin = num_of_nowin + 1
print(num_of_won)
print(num_of_nowin)

print(df[df.target == 'won'].count())

#print(df[:1])
#print(df.bkblk.to_string(index=False))
#print(df.target.unique())
#ini_entropy = (() + ())
This could work:

outdf = df.apply(lambda x: pd.crosstab(index=df.target, columns=x).to_dict())

Basically we go over each feature column and make a crosstab of it against the target column. Hope this helps! :)
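For instance, on a toy frame standing in for one feature column and the target, the crosstab (or an equivalent groupby) gives exactly the per-value counts the question describes. A sketch with made-up data:

import pandas as pd

# Made-up data standing in for the first feature column and the target column.
df = pd.DataFrame({
    'bkblk':  ['f', 't', 't', 'f', 't', 'f'],
    'target': ['nowin', 'won', 'won', 'nowin', 'won', 'nowin'],
})

# Counts of each target value per feature value, i.e. SQL-style GROUP BY on both columns.
print(pd.crosstab(index=df.target, columns=df.bkblk))
print(df.groupby(['bkblk', 'target']).size())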
Google chart input data
I have a Python script to build inputs for a Google chart. It correctly creates the column headers and the correct number of rows, but repeats the data of the last row in every row. I tried explicitly setting the row indices rather than using a loop (which wouldn't work in practice, but should have worked in testing); it still gives me the same values for each entry. I also had it working when I had this code on the same page as the HTML user form.

end1 = number of rows in the data table
end2 = number of columns in the data table, represented by a list of column headers
viewData = data stored in the database

c = connections['default'].cursor()
c.execute("SELECT * FROM {0}.\"{1}\"".format(analysis_schema, viewName))
viewData = c.fetchall()
curDesc = c.description
end1 = len(viewData)
end2 = len(curDesc)

Creates column headers:

colOrder = [curDesc[2][0]]
if activityOrCommodity == "activity":
    tableDescription = {curDesc[2][0]: ("string", "Activity")}
elif (activityOrCommodity == "commodity") or (activityOrCommodity == "aa_commodity"):
    tableDescription = {curDesc[2][0]: ("string", "Commodity")}
for i in range(3, end2):
    attValue = curDesc[i][0]
    tableDescription[curDesc[i][0]] = ("number", attValue)
    colOrder.append(curDesc[i][0])

Creates row data:

data = []
values = {}
for i in range(0, end1):
    for j in range(2, end2):
        if j == 2:
            values[curDesc[j][0]] = viewData[i][j].encode("utf-8")
        else:
            values[curDesc[j][0]] = viewData[i][j]
    data.append(values)

dataTable = gviz_api.DataTable(tableDescription)
dataTable.LoadData(data)
return dataTable.ToJSon(columns_order=colOrder)

An example JavaScript output:

var dt = new google.visualization.DataTable({cols:[{id:'activity',label:'Activity',type:'string'},{id:'size',label:'size',type:'number'},{id:'compositeutility',label:'compositeutility',type:'number'}],rows:[{c:[{v:'AA26FedGovAccounts'},{v:49118957568.0},{v:1.94956132673}]},{c:[{v:'AA26FedGovAccounts'},{v:49118957568.0},{v:1.94956132673}]},{c:[{v:'AA26FedGovAccounts'},{v:49118957568.0},{v:1.94956132673}]},{c:[{v:'AA26FedGovAccounts'},{v:49118957568.0},{v:1.94956132673}]},{c:[{v:'AA26FedGovAccounts'},{v:49118957568.0},{v:1.94956132673}]}]}, 0.6);
It seems you're appending values to data, but values is never reset between iterations, so every row ends up sharing the same dict. I assume this is not intended, right? If so, just move values inside the first for loop in your row-building code.
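A sketch of the corrected row-building loop, assuming the variables from the question (viewData, curDesc, end1, end2):

data = []
for i in range(0, end1):
    values = {}    # fresh dict per row, so earlier rows are no longer overwritten
    for j in range(2, end2):
        if j == 2:
            values[curDesc[j][0]] = viewData[i][j].encode("utf-8")
        else:
            values[curDesc[j][0]] = viewData[i][j]
    data.append(values)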