Attempting to find Cooccupancy using a custom python script - python

This script was created by an ex-lab member that was quite a bit more adapt at Python scripting than I am.
I am attempting to find Cooccupancy between annotated peaks in "exon" regions of the entire human h19 genome. However, after trying to get this to run for about an hour I am looking for help.
Here is the script:
#!/usr/bin/python
import math
import sys
import re
import csv
import MySQLdb
import itertools
import argparse
# format for execution: ./findCooccupancy.py <loci file> <comma separated list of marks to check> <window size> <outputfile>
# example: ./findCooccupancy.py AllGenes.txt PolII-ChIP,KAP1-ChIP,Hexim 150 output.txt
# format of loci file:
# chr2 12345678 12345900 GENEA 1 +
# chr4 987654321 98765000 GENEB 1 -
# etc...
locifile = sys.argv[1]
marks = sys.argv[2]
window = int(sys.argv[3])
outputfile = sys.argv[4]
loci = list(csv.reader(open(locifile, 'rb'),delimiter='\t'))
#loci = list(itertools.chain.from_iterable(loci))
db = MySQLdb.connect(host="localhost",user="snrnp",passwd="snrnp",db="snrnp")
cur = db.cursor()
cntdict = {}
for mark in marks.split(","):
cntdict[mark] = []
counter = 1
for locus in loci:
print "Working on line# " + str(counter)
counter += 1
if str(locus[5]) == "+":
exon = locus[1]
else:
exon = locus[2]
for mark in marks.split(","):
# this is incredibly dirty. sorry. I don't have time to do this better
if mark == 'PolII-ChIP':
cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")")
#print "select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")"
else:
cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and ((chr_start < " + str(exon) + " and chr_end > " + str(exon) + ") or (abs(chr_start - " + str(exon) + ") < " + str(window) + ") or (abs(chr_end - " + str(exon) + ") < " + str(window) + "))")
#print "select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and ((chr_start < " + str(exon) + " and chr_end > " + str(exon) + ") or (abs(chr_start - " + str(exon) + ") < " + str(window) + ") or (abs(chr_end - " + str(exon) + ") < " + str(window) + "))"
cnt = cur.fetchone()[0]
if cnt > 0:
cntdict[mark].append(",".join(locus))
convertedlist = []
for key in cntdict.keys():
convertedlist.append(cntdict[key])
intersectlist = set(convertedlist[0]).intersection(*convertedlist[1:])
for key in cntdict.keys():
print str(key) + " hits: " + str(len(cntdict[key]))
print "\nTotal Intersection Count: " + str(len(intersectlist))
with open(outputfile, 'w') as outputwriter:
for line in intersectlist:
outputwriter.write(line + "\n")
This is the command line that I have been using:
./findCooccupancy.py ~/code/snRNP/analysis/from\ sequencing/KEC_Project/Pol-IIAnnotatedPeaksGenome.txt PolII-ChIP 150 KECExonOccupancy.txt
This is the latest error message I have received:
Working on line# 1
Traceback (most recent call last):
File "./findCooccupancy.py", line 41, in <module>
cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")")
File "/Library/Python/2.7/site-packages/MySQLdb/cursors.py", line 205, in execute
self.errorhandler(self, exc, value)
File "/Library/Python/2.7/site-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
_mysql_exceptions.OperationalError: (1054, "Unknown column 'Start' in 'where clause'")

Related

Below piece of code is not deleting records but the records are deleted from the query when run manually

def complete_stage_purge_process(self, target_cnxn, stage_table, process_cd):
self.logger.debug(datetime.now())
self.logger.debug('complete_stage_purge_process')
delete_dt = datetime.today() - timedelta(days=30)
delete_dt = str(delete_dt)
run_pk_sql = "select run_pk from " + schemaName.PROCESS.value + "." + tableName.RUN_LOG.value + " where " + ProcessRunlog.ETL_MODIFIED_DTM.value + " <= '" + delete_dt + "' and " + \
ProcessRunlog.PROCESS_PK.value + " = (select " + ProcessRunlog.PROCESS_PK.value + " from " + schemaName.PROCESS.value + "." + \
tableName.PROCESS.value + " where " + \
Process.PROCESS_CODE.value + " = '" + process_cd + "') "
delete_sql = "delete from " + schemaName.STAGE.value + "." + stage_table + " where run_pk in (" + run_pk_sql + ")"
print(delete_sql)
print(target_cnxn)
try:
trgt_cursor = target_cnxn.cursor()
trgt_cursor.execute(delete_sql)
self.logger.debug("deletes processed successfully ")
except:
self.logger.exception('Error in processing deletes')
raise
But when added commit after trgt_cursor.execute(delete_sql) then below error is thrown. Could someone please help on how to handle this
AttributeError: 'psycopg2.extensions.cursor' object has no attribute 'commit'

Getting the subheaders in openpyxl

I'm currently using openpyxl to use an excel file that has mul-indices (2 Levels of headers) and i'm trying to do the operations depending on the subheaders a header has.
I have some exp. doing this in pandas but for this Project i have to use openpyxl which i barly made any use of before.
Only thing i could think off is the manual way:
iterating over the rows
saving the first row as header and 2nd row as subheader
do some cleaning.
manually save the headers with their subheaders in dics. then filleing in the values by iterating over all the cols
my code is as follows:
#reading the excel file
path = r'path to file'
wb = load_workbook(path) #loading the excel table
ws = wb.active #grab the active worksheet
#Setting the doc Header
for h in ws.iter_rows(max_row = 1, values_only = True): #getting the first row (Headers) in the table
header = list(h)
for sh in ws.iter_rows(min_row = 1 ,max_row = 2, values_only = True):
sub_header = list(sh)
#removing all of the none Values
header = list(filter(None, header))
sub_header = list(filter(None, sub_header))
print(header)
print(sub_header)
#creating a list of all the Columns in the excel file
col_list = []
for col in ws.iter_cols(min_row=3,min_col = 1): #Iteration over every single row starting from the third row since first two are the headers
col = [cell.value for cell in col] #Creating a list from each row
col = list(filter(None, col)) #removing the none values from each row
col_list.append(col) #creating a list of all rows (starting from the 3d one)
#print (col_list)
But i'm sure there must be a better way that i wasnt able to find in the docs or by checking this website.
Thanks in advance!
My goal in the end is to automate this part of my code by iterating over the header and use the subheaders of that head and their values each time
code:
#bulding the templates using yattag "yattag.org"
doc , tag , text = Doc().tagtext()
#building the tags of the xml file
with tag("Data"): #root tag
for row in row_list :
with tag("Row"):
with tag("Input"):
with tag(header[0].replace(' ','_').replace('\n','_')):
text("In " + dic[row[0]]+" the precentage of Students " + " regarding the " + header[0] + " the Precentage of Students with "+ sub_header[0] + " is "+ str(row[1]) + " whereas the " + sub_header[1] + " are " + str(row[2]) )
with tag("Row_Data"):
text(dic[row[0]] + " | " + header[0] + " | " + sub_header[0]+ " | " + str(row[1]) + " | " + sub_header[1] + " | " + str(row[2]))
with tag(header[1].replace(' ','_').replace('\n','_')):
text("In " + dic[row[0]]+" the precentage of Students " + " regarding the " + header[1] + " the Precentage of Students with "+ sub_header[2] + " is "+ str(row[3]) + " whereas the " + sub_header[3] + " are " + str(row[4]) )
with tag("Row_Data"):
text(dic[row[0]] + " | " + header[1] + " | " + sub_header[2]+ " | " + str(row[3]) + " | " + sub_header[3] + " | " + str(row[4]))
with tag(header[2].replace(' ','_').replace('\n','_')):
text("In " + dic[row[0]]+" the precentage of Students " + " regarding the " + header[2] + " the Precentage of Students with "+ sub_header[4] + " is "+ str(row[5]) + " whereas the " + sub_header[5] + " are " + str(row[6]) )
with tag("Row_Data"):
text(dic[row[0]] + " | " + header[2] + " | " + sub_header[4]+ " | " + str(row[5]) + " | " + sub_header[5] + " | " + str(row[6]))
with tag(header[3].replace(' ','_').replace('\n','_')):
text("In " + dic[row[0]]+" the precentage of Students " + " regarding the " + header[3] + " the Precentage of Students with "+ sub_header[6] + " is "+ str(row[7]) + " whereas the " + sub_header[7] + " are " + str(row[8]) +" and for " + sub_header[8] + str(row[9]) )
with tag("Row_Data"):
text(dic[row[0]] + " | " + header[3] + " | " + sub_header[6]+ " | " + str(row[7]) + " | " + sub_header[7] + " | " + str(row[8]) + " | " + sub_header[8] + " | " + str(row[9]))
with tag(header[4].replace(' ','_').replace('\n','_')):
text("In " + dic[row[0]]+" the precentage of Students " + " regarding the " + header[4] + " the Precentage of Students with "+ sub_header[9] + " is "+ str(row[10]) + " whereas the " + sub_header[10] + " are " + str(row[11]) )
with tag("Row_Data"):
text(dic[row[0]] + " | " + header[4] + " | " + sub_header[9]+ " | " + str(row[10]) + " | " + sub_header[10] + " | " + str(row[11]))
with tag(header[5].replace(' ','_').replace('\n','_')):
text("In " + dic[row[0]]+" the precentage of Students " + " regarding the " + header[5] + " the Precentage of Students with "+ sub_header[11] + " is "+ str(row[12]) + " whereas the " + sub_header[12] + " are " + str(row[13]) )
with tag("Row_Data"):
text(dic[row[0]] + " | " + header[5] + " | " + sub_header[11]+ " | " + str(row[12]) + " | " + sub_header[12] + " | " + str(row[13]))
with tag(header[6].replace(' ','_').replace('\n','_')):
text("In " + dic[row[0]]+" the precentage of Students " + " regarding the " + header[6] + " the Precentage of Students with "+ sub_header[13] + " is "+ str(row[14]) + " whereas the " + sub_header[14] + " are " + str(row[15]) )
with tag("Row_Data"):
text(dic[row[0]] + " | " + header[6] + " | " + sub_header[13]+ " | " + str(row[14]) + " | " + sub_header[14] + " | " + str(row[15]))
#print(doc.getvalue())
result = indent(
doc.getvalue(),
indentation=' ',
indent_text=True
)
#saving the xml file
with open("output.xml", "w") as f:
f.write(result)
Unless Pandas is completely off the table I think you might be able to do something pandas and openpyxl. The documentation mentions reading data from openpyxl into a pandas dataframe: Working with Pandas and Numpy.
Could you use:
data = ws.values
df = DataFrame(data[2:,:], index=data[0], columns=data[1])
There may be some filtering necessary with regards to the None values.

Syntax error in postgresql query coded in python

I am executing PostgreSQL13 queries coding them in python 3.9 using the psycopg2 library. I also am working with PostGIS extension over PostgreSQL.
Kindly look for the comment which points out the line which causes the syntax error. I am having trouble both understanding what is the syntax error and how to debug it since I need to execute PostgreSQL queries using python so any tips will be greatly appreciated.
def corefunc(rf, openConnection):
pcur = openConnection.cursor(name="pcur" + rf)
rcur = openConnection.cursor(name="rcur" + rf)
acur = openConnection.cursor()
rcur.execute("SELECT geom FROM " + rf)
for number in range (1, 5):
acur.execute("DROP TABLE IF EXISTS " + "pf"+ rf)
acur.execute("CREATE TABLE " + "pf" + rf + " (index integer, sums integer)")
pcur.execute("SELECT geom FROM " + "pf" + str(number))
row = 1
for each in rcur.fetchall():
if number == 1: acur.execute("INSERT INTO " + "pf" + rf + " (index, sums) VALUES (" + str(row) + ",0)")
for eachone in pcur.fetchall():
#-------------------------------------------- the statement below gives the syntax error
acur.execute("UPDATE TABLE " + "pf" + rf + " SET sums = sums + "\
+"ST_Contains(" + " ' " + each[0] + " ' " + ", " + " ' " + eachone[0] + " ' " + ")::int WHERE index = " + str(row))
row = row + 1
def parallelJoin (pointsTable, rectsTable, outputTable, outputPath, openConnection):
#Implement ParallelJoin Here.
cursor = openConnection.cursor()
cursor.execute("SELECT COUNT(*) FROM " + pointsTable)
size_data = (cursor.fetchall())[0][0]
for number in range(1, 5):
cursor.execute("DROP TABLE IF EXISTS pf" + str(number))
cursor.execute("CREATE TABLE pf" + str(number) + " AS SELECT * FROM "
+ pointsTable + " LIMIT " + str(size_data/4)
+ " OFFSET " + str(((number-1)*size_data)/4))
cursor.execute("SELECT COUNT(*) FROM " + rectsTable)
size_rects = (cursor.fetchall())[0][0]
for number in range(1, 5):
cursor.execute("DROP TABLE IF EXISTS rf" + str(number))
cursor.execute("CREATE TABLE rf" + str(number) + " AS SELECT * FROM "
+ pointsTable + " LIMIT " + str(size_rects/4)
+ " OFFSET " + str(((number - 1) * size_rects)/4))
threads = dict()
for number in range(0, 4):
threads[number] = threading.Thread(target=corefunc, args=("rf" + str(number + 1), openConnection))
threads[number].start()
break
while threads[0].is_alive() or threads[1].is_alive()\
or threads[2].is_alive() or threads[3].is_alive(): pass
# more shit to do
Nevermind, I just thought checking out the syntax from somewhere and I found the problem. To update table, the query begins with "UPDATE...." not "UPDATE TABLE.....".

How to get new line

How can I print a new line on the output file? When I try to add the new line with "/n" it just prints /n
This is what I have so far.
``
inputFile = open("demofile1.txt", "r")
outFile = open("Ji
string = line.split(',')
go =(string)[3::]
bo = [float(i) for i in go]
total = sum(bo)
pine = ("%8.2f"%total)
name = string[2] + "," + " " + string[1]
kale = (string[0] + " " + name + " " + "/n")
se)
Current Result
8
53 Baul
A999999
You need to use \n, not /n. So this line:
kale = (string[0] + " " + name + " " + "/n")
Should be:
kale = (string[0] + " " + name + " " + "\n")
Also, please do consider using a str formatter, so all these lines:
go =(string)[3::]
bo = [float(i) for i in go]
total = sum(bo)
pine = ("%8.2f"%total)
name = string[2] + "," + " " + string[1]
kale = (string[0] + " " + name + " " + "/n")
str1 = ''.join(kale)
str2 = ''.join(pine)
outFile.write(str1 + " " + str2 + " ")
Will become:
outFile.write("{} {} {:8.2f}\n".format(string[0], string[2] + ", " + string[1], sum(bo))

Python - key error when using "if in dict"

I am receiving the following error when running a script to parse contents of an XML file.
if iteration.findtext("Iteration_query-def") in ecdict:
KeyError: 'XLOC_000434'
I was under the impression that using "if in dict" would mean that if the key is not found in the dictionary, the script will continue past the if statement and proceed with the rest of the code. Below is the problematic section of the code I am using. I realise this is quite a basic question, but I am unsure what else I can say, and I don't understand why I am receiving this error.
import xml.etree.ElementTree as ET
tree = ET.parse('507.FINAL_14.2.14_2_nr.out_fmt5.out')
blast_iteration = tree.find("BlastOutput_iterations")
for iteration in blast_iteration.findall("Iteration"):
query = iteration.findtext("Iteration_query-def").strip().strip("\n")
if query in score:
continue
if iteration.findtext("Iteration_message") == "No hits found":
if iteration.findtext("Iteration_query-def") in tair:
tairid = tair[iteration.findtext("Iteration_query-def")][0]
tairdes = tair[iteration.findtext("Iteration_query-def")][1]
else:
tairid = "-"
tairdes = "-"
goterms = ""
ecterms = ""
if iteration.findtext("Iteration_query-def") in godict:
for x in godict[iteration.findtext("Iteration_query-def")][:-1]:
goterms = goterms + x + ";"
goterms = goterms + godict[iteration.findtext("Iteration_query-def")][-1]
else:
goterms = "-"
if iteration.findtext("Iteration_query-def") in ecdict:
for x in ecdict[iteration.findtext("Iteration_query-def")][:-1]:
ecterms = ecterms + x + ";"
ecterms = ecterms + ecdict[iteration.findtext("Iteration_query-def")][-1]
else:
ecterms = "-"
if iteration.findtext("Iteration_query-def") in godescr:
desc = godescr[iteration.findtext("Iteration_query-def")]
else:
desc = "-"
n += 1
p = "PvOAK_up"+str(n) + "\t" + tranlen[iteration.findtext("Iteration_query-def")] + "\t" + orflen[iteration.findtext("Iteration_query-def")] + "\t" + "-" + "\t" + "-" + "\t" + tairid + "\t" + tairdes + "\t" + goterms + "\t" + ecterms + "\t" + desc + "\t" + str(flower[query][2]) + "\t" + str('{0:.2e}'.format(float(flower[query][1]))) + "\t" + str('{0:.2f}'.format(float(flower[query][0]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][2]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][1]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][0])))
print p
Hope you can help,
Thanks.
edit: I should say that godict and ecdict were previously created as follows - I can submit the entire code if needs be:
godict = {}
ecdict = {}
godescr = {}
f = open("507.FINAL_14.2.14_2_nr.out_fmt5.out.annot")
for line in f:
line = line.split("\t")
if len(line) > 2:
godescr[line[0]] = line[2]
line[1] = line[1].strip("\n")
if line[1].startswith("EC"):
if line[0] in ecdict:
a = ecdict[line[0]]
a.append(line[1])
ecdict[line[0]] = a
else:
ecdict[line[0]] = [line[1]]
else:
if line[0] in godict:
a = godict[line[0]]
a.append(line[1])
godict[line[0]] = a
else:
godict[line[0]] = [line[1]]
Traceback:
Traceback (most recent call last):
File "2d.test.py", line 170, in <module>
p = "PvOAK_up"+str(n) + "\t" + tranlen[iteration.findtext("Iteration_query-def")] + "\t" + orflen[iteration.findtext("Iteration_query-def")] + "\t" + "-" + "\t" + "-" + "\t" + tairid + "\t" + tairdes + "\t" + goterms + "\t" + ecterms + "\t" + desc + "\t" + str(flower[query][2]) + "\t" + str('{0:.2e}'.format(float(flower[query][1]))) + "\t" + str('{0:.2f}'.format(float(flower[query][0]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][2]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][1]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][0])))
KeyError: 'XLOC_000434'

Categories

Resources