Python - KeyError when using "if in dict"

I am receiving the following error when running a script to parse contents of an XML file.
if iteration.findtext("Iteration_query-def") in ecdict:
KeyError: 'XLOC_000434'
I was under the impression that using "if in dict" would mean that if the key is not found in the dictionary, the script will continue past the if statement and proceed with the rest of the code. Below is the problematic section of the code I am using. I realise this is quite a basic question, but I am unsure what else I can say, and I don't understand why I am receiving this error.
import xml.etree.ElementTree as ET

tree = ET.parse('507.FINAL_14.2.14_2_nr.out_fmt5.out')
blast_iteration = tree.find("BlastOutput_iterations")
for iteration in blast_iteration.findall("Iteration"):
    query = iteration.findtext("Iteration_query-def").strip().strip("\n")
    if query in score:
        continue
    if iteration.findtext("Iteration_message") == "No hits found":
        if iteration.findtext("Iteration_query-def") in tair:
            tairid = tair[iteration.findtext("Iteration_query-def")][0]
            tairdes = tair[iteration.findtext("Iteration_query-def")][1]
        else:
            tairid = "-"
            tairdes = "-"
        goterms = ""
        ecterms = ""
        if iteration.findtext("Iteration_query-def") in godict:
            for x in godict[iteration.findtext("Iteration_query-def")][:-1]:
                goterms = goterms + x + ";"
            goterms = goterms + godict[iteration.findtext("Iteration_query-def")][-1]
        else:
            goterms = "-"
        if iteration.findtext("Iteration_query-def") in ecdict:
            for x in ecdict[iteration.findtext("Iteration_query-def")][:-1]:
                ecterms = ecterms + x + ";"
            ecterms = ecterms + ecdict[iteration.findtext("Iteration_query-def")][-1]
        else:
            ecterms = "-"
        if iteration.findtext("Iteration_query-def") in godescr:
            desc = godescr[iteration.findtext("Iteration_query-def")]
        else:
            desc = "-"
        n += 1
        p = "PvOAK_up"+str(n) + "\t" + tranlen[iteration.findtext("Iteration_query-def")] + "\t" + orflen[iteration.findtext("Iteration_query-def")] + "\t" + "-" + "\t" + "-" + "\t" + tairid + "\t" + tairdes + "\t" + goterms + "\t" + ecterms + "\t" + desc + "\t" + str(flower[query][2]) + "\t" + str('{0:.2e}'.format(float(flower[query][1]))) + "\t" + str('{0:.2f}'.format(float(flower[query][0]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][2]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][1]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][0])))
        print p
Hope you can help,
Thanks.
Edit: I should say that godict and ecdict were previously created as follows (I can post the entire code if need be):
godict = {}
ecdict = {}
godescr = {}
f = open("507.FINAL_14.2.14_2_nr.out_fmt5.out.annot")
for line in f:
    line = line.split("\t")
    if len(line) > 2:
        godescr[line[0]] = line[2]
    line[1] = line[1].strip("\n")
    if line[1].startswith("EC"):
        if line[0] in ecdict:
            a = ecdict[line[0]]
            a.append(line[1])
            ecdict[line[0]] = a
        else:
            ecdict[line[0]] = [line[1]]
    else:
        if line[0] in godict:
            a = godict[line[0]]
            a.append(line[1])
            godict[line[0]] = a
        else:
            godict[line[0]] = [line[1]]
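(As an aside, the same mapping can be built a little more compactly with collections.defaultdict; a sketch under the same file-format assumptions, not part of the original code:

from collections import defaultdict

godict = defaultdict(list)
ecdict = defaultdict(list)
godescr = {}
with open("507.FINAL_14.2.14_2_nr.out_fmt5.out.annot") as f:
    for line in f:
        fields = line.rstrip("\n").split("\t")
        if len(fields) > 2:
            godescr[fields[0]] = fields[2]
        # EC terms and GO terms go into separate dictionaries, as above
        target = ecdict if fields[1].startswith("EC") else godict
        target[fields[0]].append(fields[1])

Note that with defaultdict, reading a missing key via ecdict[key] inserts an empty list instead of raising KeyError, so keep using "key in ecdict" for membership tests.)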
Traceback:
Traceback (most recent call last):
  File "2d.test.py", line 170, in <module>
    p = "PvOAK_up"+str(n) + "\t" + tranlen[iteration.findtext("Iteration_query-def")] + "\t" + orflen[iteration.findtext("Iteration_query-def")] + "\t" + "-" + "\t" + "-" + "\t" + tairid + "\t" + tairdes + "\t" + goterms + "\t" + ecterms + "\t" + desc + "\t" + str(flower[query][2]) + "\t" + str('{0:.2e}'.format(float(flower[query][1]))) + "\t" + str('{0:.2f}'.format(float(flower[query][0]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][2]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][1]))) + "\t" + str('{0:.2f}'.format(float(leaf[query][0])))
KeyError: 'XLOC_000434'
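Note that the traceback points at the long p = ... line itself, not at the if ... in ecdict: check: the lookups tranlen[...], orflen[...], flower[query] and leaf[query] on that line are not guarded by any membership test, so any of them can raise the KeyError for 'XLOC_000434'. A minimal sketch of one way to guard those lookups, assuming a "-" placeholder is acceptable for missing entries (that default, and skipping records absent from flower/leaf, are my assumptions, not part of the original code):

# Hypothetical sketch: guard the unchecked lookups before building p.
qdef = iteration.findtext("Iteration_query-def")
tran = tranlen.get(qdef, "-")   # "-" as a placeholder is an assumption
orf = orflen.get(qdef, "-")
if query not in flower or query not in leaf:
    continue                    # or substitute placeholder values instead of skipping
# then build p from tran and orf exactly as before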

Related

The piece of code below is not deleting records, but the same query deletes records when run manually

def complete_stage_purge_process(self, target_cnxn, stage_table, process_cd):
    self.logger.debug(datetime.now())
    self.logger.debug('complete_stage_purge_process')
    delete_dt = datetime.today() - timedelta(days=30)
    delete_dt = str(delete_dt)
    run_pk_sql = "select run_pk from " + schemaName.PROCESS.value + "." + tableName.RUN_LOG.value + " where " + ProcessRunlog.ETL_MODIFIED_DTM.value + " <= '" + delete_dt + "' and " + \
        ProcessRunlog.PROCESS_PK.value + " = (select " + ProcessRunlog.PROCESS_PK.value + " from " + schemaName.PROCESS.value + "." + \
        tableName.PROCESS.value + " where " + \
        Process.PROCESS_CODE.value + " = '" + process_cd + "') "
    delete_sql = "delete from " + schemaName.STAGE.value + "." + stage_table + " where run_pk in (" + run_pk_sql + ")"
    print(delete_sql)
    print(target_cnxn)
    try:
        trgt_cursor = target_cnxn.cursor()
        trgt_cursor.execute(delete_sql)
        self.logger.debug("deletes processed successfully ")
    except:
        self.logger.exception('Error in processing deletes')
        raise
But when I add a commit after trgt_cursor.execute(delete_sql), the error below is thrown. Could someone please help with how to handle this?
AttributeError: 'psycopg2.extensions.cursor' object has no attribute 'commit'
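In psycopg2, commit() is a method of the connection, not the cursor, so the call belongs on target_cnxn. Without a commit, the DELETE runs inside an open transaction that is rolled back when the connection closes, which is why the rows only disappear when you run the query manually. A minimal sketch of the try block with that change (the rollback is my addition, not from the original code):

try:
    trgt_cursor = target_cnxn.cursor()
    trgt_cursor.execute(delete_sql)
    target_cnxn.commit()    # commit on the connection; cursors have no commit()
    self.logger.debug("deletes processed successfully ")
except Exception:
    target_cnxn.rollback()  # optional: discard the failed transaction (my addition)
    self.logger.exception('Error in processing deletes')
    raise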

Appending to array using pool

I am trying to scrape data from soccerway.com, checking whether each page is a completed game or a game still to be played, and writing each kind to a separate csv file. I am running through 10,000 pages, so I have written it using Pools. However, the lists I append to come back empty and I cannot write anything to the csv files.
I tried writing straight to the file instead of appending to a list, but that gave incomplete files.
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import uuid
import time
from multiprocessing import Pool
import sys, os

fixturesA = []
linksA = []
statsA = []

def parse(url):
    try:
        #print(url)
        delays = [0.25,0.5,0.75,1]
        delay = np.random.choice(delays)
        #time.sleep(delay)
        #r = requests.get(url)
        r = requests.get(url, timeout = 10)
        soup = BeautifulSoup(r.content, "html.parser")
        teams = soup.findAll('h3', attrs = {'class' : 'thick'})
        homeTeam = teams[0].text.strip()
        awayTeam = teams[2].text.strip()
        middle = teams[1].text.strip()
        dds = soup.findAll('dd')
        date = dds[1].text.strip()
        gameWeek = dds[2].text.strip()
        if ':' not in middle:
            middle = middle.split(" - ")
            homeGoals = 0
            awayGoals = 0
            homeGoals = middle[0]
            try:
                awayGoals = middle[1]
            except Exception as e:
                homeGoals = "-1"
                awayGoals = "-1"
            matchGoals = int(homeGoals) + int(awayGoals)
            if(matchGoals >= 0):
                if(int(homeGoals) > 0 and int(awayGoals) > 0):
                    btts = "y"
                else:
                    btts = "n"
                halfTimeScore = dds[4].text.strip().split(" - ")
                firstHalfHomeGoals = halfTimeScore[0]
                firstHalfAwayConc = halfTimeScore[0]
                firstHalfAwayGoals = halfTimeScore[1]
                firstHalfHomeConc = halfTimeScore[1]
                firstHalfTotalGoals = int(firstHalfHomeGoals) + int(firstHalfAwayGoals)
                secondHalfHomeGoals = int(homeGoals) - int(firstHalfHomeGoals)
                secondHalfAwayConc = int(homeGoals) - int(firstHalfHomeGoals)
                secondHalfAwayGoals = int(awayGoals) - int(firstHalfAwayGoals)
                secondHalfHomeConc = int(awayGoals) - int(firstHalfAwayGoals)
                secondHalfTotalGoals = matchGoals - firstHalfTotalGoals
                homeTeamContainers = soup.findAll('div', attrs = {'class' : 'container left'})
                homeTeamStarting = homeTeamContainers[2]
                homeTeamBench = homeTeamContainers[3]
                homeTeamYellows = len(homeTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/YC.png' })) + len(homeTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/YC.png' }))
                homeTeamReds = len(homeTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/RC.png' })) + len(homeTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/RC.png' }))
                homeTeamCards = homeTeamYellows + homeTeamReds
                awayTeamContainers = soup.findAll('div', attrs = {'class' : 'container right'})
                awayTeamStarting = awayTeamContainers[2]
                awayTeamBench = awayTeamContainers[3]
                awayTeamYellows = len(awayTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/YC.png' })) + len(awayTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/YC.png' }))
                awayTeamReds = len(awayTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/RC.png' })) + len(awayTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/RC.png' }))
                awayTeamCards = awayTeamYellows + awayTeamReds
                matchCards = homeTeamCards + awayTeamCards
                try:
                    iframe = soup.findAll('iframe')
                    iframeSrc = iframe[1]['src']
                    url = 'https://us.soccerway.com/' + iframeSrc
                    c = requests.get(url,timeout = 10)
                    soupC = BeautifulSoup(c.content, "html.parser")
                    cornerContainer = soupC.findAll('td', attrs = {'class' : 'legend left value'})
                    homeCorners = cornerContainer[0].text.strip()
                    awayCornersConc = homeCorners
                    cornerContainer = soupC.findAll('td', attrs = {'class' : 'legend right value'})
                    awayCorners = cornerContainer[0].text.strip()
                    homeCornersConc = awayCorners
                    matchCorners = int(homeCorners) + int(awayCorners)
                    print("Got Score . " + homeTeam + " vs " + awayTeam+" . " + gameWeek )
                    statsA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + homeCorners + "," + awayCorners + "," + homeCornersConc + "," + awayCornersConc + "," + str(matchCorners)+","+dds[0].text.strip() + "\n")
                    return None
                except Exception as e:
                    print("Got Score no corners. " + homeTeam + " vs " + awayTeam+" . " + gameWeek + " NO FRAME")
                    statsA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + "" + "," + "" + "," + "" + "," + "" + "," + ""+","+dds[0].text.strip() + "\n")
                    return None
        else:
            fixturesA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + date + "\n")
            linksA.append(url + "\n")
            print(homeTeam + " vs " + awayTeam + " at " + middle + " GW:" + gameWeek)
            return None
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        linksA.append(url + "\n")
        print(url)
        return None

stats = open('Statsv2.csv','a',encoding='utf-8')
fixtures = open('fixturesv2.csv','w',encoding='utf-8')
with open('links.txt') as f:
    content = f.readlines()
content = [x.strip() for x in content]
links = open('links.txt','w')

if __name__ == '__main__':
    start_time = time.time()
    p = Pool(20) # Pool tells how many at a time
    records = p.map(parse, content)
    p.terminate()
    p.join()
    print("--- %s seconds ---" % (time.time() - start_time))
I assume you are running Windows? Then the answer is that multiprocessing on Windows creates new processes by copying (spawning) rather than forking. So you have your main process with the lists, and the worker processes from the pool each get their own separate set of lists.
The workers most likely fill their own lists correctly, but the lists in the main process never receive any data and so stay empty, and the workers do not return anything either. Since you write your files in the main process, you get empty files.
An easy way to solve this is to create pipes or queues between the main process and the workers to allow communication between the processes. You could also use the shared arrays provided by the multiprocessing module, but then you would need to know the length at creation time.
see documentation: Multiprocessing
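If you do want the workers to push rows as they go instead of returning them, a Manager-backed queue is one option; a minimal, self-contained sketch of that pattern (the worker body, URLs, and file name here are placeholders, not the original scraper):

from multiprocessing import Pool, Manager
from functools import partial

def worker(queue, url):
    row = "some,csv,row,for," + url        # placeholder for the real parsing
    queue.put(row)                         # send the result back to the parent

if __name__ == '__main__':
    urls = ['http://example.com/a', 'http://example.com/b']   # placeholder input
    manager = Manager()
    queue = manager.Queue()                # a plain Queue cannot be passed to Pool workers
    with Pool(4) as pool:
        pool.map(partial(worker, queue), urls)
    with open('stats_demo.csv', 'w', encoding='utf-8') as fd:
        while not queue.empty():           # safe here: map() has finished, so all rows are queued
            fd.write(queue.get() + "\n")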
As pointed out by @RaJa, you're not actually returning anything the parent/controlling process can see. The easiest fix is to return values from the mapped function.
For example, parse() could return a tuple at the end:
def parse(url):
    # do work
    return url, homeTeam, awayTeam, gameWeek, homeGoals, awayGoals # ...
Then the parent process can receive the values and do useful things like saving them to a CSV file:
import csv

with Pool(20) as pool:
    records = pool.map(parse, content)

with open('stats.csv', 'w') as fd:
    out = csv.writer(fd)
    out.writerow([
        'url', 'hometeam', 'awayteam',
        # and the remaining column names for the header
    ])
    out.writerows(records)

How to get new line

How can I print a new line to the output file? When I try to add the new line with "/n", it just prints /n.
This is what I have so far.
inputFile = open("demofile1.txt", "r")
outFile = open("Ji
string = line.split(',')
go =(string)[3::]
bo = [float(i) for i in go]
total = sum(bo)
pine = ("%8.2f"%total)
name = string[2] + "," + " " + string[1]
kale = (string[0] + " " + name + " " + "/n")
se)
Current Result
8
53 Baul
A999999
You need to use \n, not /n. So this line:
kale = (string[0] + " " + name + " " + "/n")
Should be:
kale = (string[0] + " " + name + " " + "\n")
Also, please do consider using a str formatter, so all these lines:
go =(string)[3::]
bo = [float(i) for i in go]
total = sum(bo)
pine = ("%8.2f"%total)
name = string[2] + "," + " " + string[1]
kale = (string[0] + " " + name + " " + "/n")
str1 = ''.join(kale)
str2 = ''.join(pine)
outFile.write(str1 + " " + str2 + " ")
Will become:
outFile.write("{} {} {:8.2f}\n".format(string[0], string[2] + ", " + string[1], sum(bo))

Problems with incrementing location of widget creation when function is called

I am trying to make this function create a label and 2 buttons, and each time the function is called, the 3 widgets should be created on the next row (directly under the previous 3 widgets).
However, I am not sure why the items keep being created on the same row (effectively overlapping the previous ones each time the function is called) despite the counter being incremented.
def fetch_quick(self, entries):
    for entry in entries:
        text = entry[1].get()
        print(text)
        exec("app._framea" + str(self._qqq+7) + "= tk.Frame(app._master, bg='white')")
        exec("app._framea" + str(self._qqq+7) + ".grid(row=" + str(self._qqq+6) + ")")
        exec("self.queue_entry_quick" + str(self._qqq) + " = Label(app._framea" + str(self._qqq+7) + ", text='1 '+text +' 0 a few seconds ago')")
        exec("self.queue_entry_quick" + str(self._qqq) + ".grid(row=" + str(self._qqq) + ")")
        exec("self._Button" + str(self._qqq) + " = Button(app._framea" + str(self._qqq+7) + ", text = self._qqq, width = 2, command=app._framea" + str(self._qqq+7) + ".destroy, bg='red')")
        exec("self._Buttonb" + str(self._qqq) + " = Button(app._framea" + str(self._qqq+7) + ", text = self._qqq, width = 2, command=app._framea" + str(self._qqq+7) + ".destroy, bg='green')")
        exec("self._Button" + str(self._qqq) + ".grid(row=" + str(self._qqq) + ", column=1)")
        exec("self._Button" + str(self._qqq) + ".bind('<Button-1>',self.call)")
        exec("self._Buttonb" + str(self._qqq) + ".grid(row=" + str(self._qqq) + ", column=2)")
        exec("self._Buttonb" + str(self._qqq) + ".bind('<Button-1>',self.call)")
        abcd.append(text)
        self._qqq += 1
    print(self._qqq)
I think the issue might be that the widgets' grid location is preset to row 0 for all of the widgets, and thus self._qqq is not being picked up each time the function is called. If this is the case, I am still unsure what to do about it.
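For what it's worth, the same layout can be expressed without exec by keeping the widget references in a list and gridding each new frame at an incrementing row; a minimal sketch under those assumptions (the class and attribute names here are illustrative, not taken from the original code):

import tkinter as tk

class QuickList:
    def __init__(self, master):
        self._master = master
        self._row = 0
        self._rows = []                    # keep references so the widgets are not lost

    def add_row(self, text):
        frame = tk.Frame(self._master, bg='white')
        frame.grid(row=self._row, column=0, sticky='w')
        label = tk.Label(frame, text='1 ' + text + ' 0 a few seconds ago')
        label.grid(row=0, column=0)
        btn_red = tk.Button(frame, text=self._row, width=2, bg='red', command=frame.destroy)
        btn_red.grid(row=0, column=1)
        btn_green = tk.Button(frame, text=self._row, width=2, bg='green', command=frame.destroy)
        btn_green.grid(row=0, column=2)
        self._rows.append((frame, label, btn_red, btn_green))
        self._row += 1                     # the next call lands on the row below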

Attempting to find Cooccupancy using a custom python script

This script was created by an ex-lab member who was quite a bit more adept at Python scripting than I am.
I am attempting to find co-occupancy between annotated peaks in "exon" regions of the entire human hg19 genome. However, after trying to get this to run for about an hour, I am looking for help.
Here is the script:
#!/usr/bin/python
import math
import sys
import re
import csv
import MySQLdb
import itertools
import argparse

# format for execution: ./findCooccupancy.py <loci file> <comma separated list of marks to check> <window size> <outputfile>
# example: ./findCooccupancy.py AllGenes.txt PolII-ChIP,KAP1-ChIP,Hexim 150 output.txt
# format of loci file:
# chr2 12345678 12345900 GENEA 1 +
# chr4 987654321 98765000 GENEB 1 -
# etc...
locifile = sys.argv[1]
marks = sys.argv[2]
window = int(sys.argv[3])
outputfile = sys.argv[4]
loci = list(csv.reader(open(locifile, 'rb'), delimiter='\t'))
#loci = list(itertools.chain.from_iterable(loci))
db = MySQLdb.connect(host="localhost", user="snrnp", passwd="snrnp", db="snrnp")
cur = db.cursor()
cntdict = {}
for mark in marks.split(","):
    cntdict[mark] = []
counter = 1
for locus in loci:
    print "Working on line# " + str(counter)
    counter += 1
    if str(locus[5]) == "+":
        exon = locus[1]
    else:
        exon = locus[2]
    for mark in marks.split(","):
        # this is incredibly dirty. sorry. I don't have time to do this better
        if mark == 'PolII-ChIP':
            cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")")
            #print "select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")"
        else:
            cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and ((chr_start < " + str(exon) + " and chr_end > " + str(exon) + ") or (abs(chr_start - " + str(exon) + ") < " + str(window) + ") or (abs(chr_end - " + str(exon) + ") < " + str(window) + "))")
            #print "select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and ((chr_start < " + str(exon) + " and chr_end > " + str(exon) + ") or (abs(chr_start - " + str(exon) + ") < " + str(window) + ") or (abs(chr_end - " + str(exon) + ") < " + str(window) + "))"
        cnt = cur.fetchone()[0]
        if cnt > 0:
            cntdict[mark].append(",".join(locus))
convertedlist = []
for key in cntdict.keys():
    convertedlist.append(cntdict[key])
intersectlist = set(convertedlist[0]).intersection(*convertedlist[1:])
for key in cntdict.keys():
    print str(key) + " hits: " + str(len(cntdict[key]))
print "\nTotal Intersection Count: " + str(len(intersectlist))
with open(outputfile, 'w') as outputwriter:
    for line in intersectlist:
        outputwriter.write(line + "\n")
This is the command line that I have been using:
./findCooccupancy.py ~/code/snRNP/analysis/from\ sequencing/KEC_Project/Pol-IIAnnotatedPeaksGenome.txt PolII-ChIP 150 KECExonOccupancy.txt
This is the latest error message I have received:
Working on line# 1
Traceback (most recent call last):
  File "./findCooccupancy.py", line 41, in <module>
    cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")")
  File "/Library/Python/2.7/site-packages/MySQLdb/cursors.py", line 205, in execute
    self.errorhandler(self, exc, value)
  File "/Library/Python/2.7/site-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
    raise errorclass, errorvalue
_mysql_exceptions.OperationalError: (1054, "Unknown column 'Start' in 'where clause'")
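One hedged reading of the error: "Unknown column 'Start'" suggests the loci file begins with a header row, so locus[1] is the literal word Start and, once pasted into the SQL string, MySQL treats it as a column name. A small sketch that skips non-data rows and uses a parameterised query instead of string concatenation (the same CHIP_PEAK table and connection are assumed; only the summit branch is shown, the chr_start/chr_end branch follows the same pattern):

# Sketch only: keep data rows and let the driver quote the values.
loci = [row for row in csv.reader(open(locifile, 'rb'), delimiter='\t')
        if row and row[0].startswith('chr')]

for locus in loci:
    exon = locus[1] if str(locus[5]) == "+" else locus[2]
    for mark in marks.split(","):
        cur.execute(
            "select count(*) from CHIP_PEAK "
            "where mark = %s and chr = %s and abs(summit - %s) < %s",
            (mark, locus[0], exon, window))
        if cur.fetchone()[0] > 0:
            cntdict[mark].append(",".join(locus))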
