I'm trying to analyze a sqlite3 file and printing the results to a text file. If i test the code with print it all works fine. When i write it to a file it cuts out at the same point every time.
import sqlite3
import datetime
import time
conn = sqlite3.connect("History.sqlite")
curs = conn.cursor()
results = curs.execute("SELECT visits.id, visits.visit_time, urls.url, urls.visit_count \
FROM visits INNER JOIN urls ON urls.id = visits.url \
ORDER BY visits.id;")
exportfile = open('chrome_report.txt', 'w')
for row in results:
timestamp = row[1]
epoch_start = datetime.datetime(1601,1,1)
delta = datetime.timedelta(microseconds=int(timestamp))
fulltime = epoch_start + delta
string = str(fulltime)
timeprint = string[:19]
exportfile.write("ID: " + str(row[0]) + "\t")
exportfile.write("visit time: " + str(timeprint) + "\t")
exportfile.write("Url: " + str(row[2]) + "\t")
exportfile.write("Visit count: " + str(row[3]))
exportfile.write("\n")
print "ID: " + str(row[0]) + "\t"
print "visit time: " + str(timeprint) + "\t"
print "Url: " + str(row[2]) + "\t"
print "Visit count: " + str(row[3])
print "\n"
conn.close()
So the print results give the proper result but the export to the file stops in the middle of a url.
OK, I would start by replacing the for loop with the one below
with open('chrome_report.txt', 'w') as exportfile:
for row in results:
try:
timestamp = row[1]
epoch_start = datetime.datetime(1601,1,1)
delta = datetime.timedelta(microseconds=int(timestamp))
fulltime = epoch_start + delta
string = str(fulltime)
timeprint = string[:19]
exportfile.write("ID: " + str(row[0]) + "\t")
exportfile.write("visit time: " + str(timeprint) + "\t")
exportfile.write("Url: " + str(row[2]) + "\t")
exportfile.write("Visit count: " + str(row[3]))
exportfile.write("\n")
print "ID: " + str(row[0]) + "\t"
print "visit time: " + str(timeprint) + "\t"
print "Url: " + str(row[2]) + "\t"
print "Visit count: " + str(row[3])
print "\n"
except Exception as err:
print(err)
By using the "with" statement (context manager) we eliminate the need to close the file. By using the try/except we capture the error and print it. This will show you where your code is failing and why.
Related
def complete_stage_purge_process(self, target_cnxn, stage_table, process_cd):
self.logger.debug(datetime.now())
self.logger.debug('complete_stage_purge_process')
delete_dt = datetime.today() - timedelta(days=30)
delete_dt = str(delete_dt)
run_pk_sql = "select run_pk from " + schemaName.PROCESS.value + "." + tableName.RUN_LOG.value + " where " + ProcessRunlog.ETL_MODIFIED_DTM.value + " <= '" + delete_dt + "' and " + \
ProcessRunlog.PROCESS_PK.value + " = (select " + ProcessRunlog.PROCESS_PK.value + " from " + schemaName.PROCESS.value + "." + \
tableName.PROCESS.value + " where " + \
Process.PROCESS_CODE.value + " = '" + process_cd + "') "
delete_sql = "delete from " + schemaName.STAGE.value + "." + stage_table + " where run_pk in (" + run_pk_sql + ")"
print(delete_sql)
print(target_cnxn)
try:
trgt_cursor = target_cnxn.cursor()
trgt_cursor.execute(delete_sql)
self.logger.debug("deletes processed successfully ")
except:
self.logger.exception('Error in processing deletes')
raise
But when added commit after trgt_cursor.execute(delete_sql) then below error is thrown. Could someone please help on how to handle this
AttributeError: 'psycopg2.extensions.cursor' object has no attribute 'commit'
I am executing PostgreSQL13 queries coding them in python 3.9 using the psycopg2 library. I also am working with PostGIS extension over PostgreSQL.
Kindly look for the comment which points out the line which causes the syntax error. I am having trouble both understanding what is the syntax error and how to debug it since I need to execute PostgreSQL queries using python so any tips will be greatly appreciated.
def corefunc(rf, openConnection):
pcur = openConnection.cursor(name="pcur" + rf)
rcur = openConnection.cursor(name="rcur" + rf)
acur = openConnection.cursor()
rcur.execute("SELECT geom FROM " + rf)
for number in range (1, 5):
acur.execute("DROP TABLE IF EXISTS " + "pf"+ rf)
acur.execute("CREATE TABLE " + "pf" + rf + " (index integer, sums integer)")
pcur.execute("SELECT geom FROM " + "pf" + str(number))
row = 1
for each in rcur.fetchall():
if number == 1: acur.execute("INSERT INTO " + "pf" + rf + " (index, sums) VALUES (" + str(row) + ",0)")
for eachone in pcur.fetchall():
#-------------------------------------------- the statement below gives the syntax error
acur.execute("UPDATE TABLE " + "pf" + rf + " SET sums = sums + "\
+"ST_Contains(" + " ' " + each[0] + " ' " + ", " + " ' " + eachone[0] + " ' " + ")::int WHERE index = " + str(row))
row = row + 1
def parallelJoin (pointsTable, rectsTable, outputTable, outputPath, openConnection):
#Implement ParallelJoin Here.
cursor = openConnection.cursor()
cursor.execute("SELECT COUNT(*) FROM " + pointsTable)
size_data = (cursor.fetchall())[0][0]
for number in range(1, 5):
cursor.execute("DROP TABLE IF EXISTS pf" + str(number))
cursor.execute("CREATE TABLE pf" + str(number) + " AS SELECT * FROM "
+ pointsTable + " LIMIT " + str(size_data/4)
+ " OFFSET " + str(((number-1)*size_data)/4))
cursor.execute("SELECT COUNT(*) FROM " + rectsTable)
size_rects = (cursor.fetchall())[0][0]
for number in range(1, 5):
cursor.execute("DROP TABLE IF EXISTS rf" + str(number))
cursor.execute("CREATE TABLE rf" + str(number) + " AS SELECT * FROM "
+ pointsTable + " LIMIT " + str(size_rects/4)
+ " OFFSET " + str(((number - 1) * size_rects)/4))
threads = dict()
for number in range(0, 4):
threads[number] = threading.Thread(target=corefunc, args=("rf" + str(number + 1), openConnection))
threads[number].start()
break
while threads[0].is_alive() or threads[1].is_alive()\
or threads[2].is_alive() or threads[3].is_alive(): pass
# more shit to do
Nevermind, I just thought checking out the syntax from somewhere and I found the problem. To update table, the query begins with "UPDATE...." not "UPDATE TABLE.....".
This is my file text:
Covid-19 Data
Country / Number of infections / Number of Death
USA 124.356 2.236
Netherlands 10.866 771
Georgia 90 NA
Germany 58.247 455
I created a function to calculate the ratio of deaths compared to the infections, however it does not work, because some of the values aren't floats.
f=open("myfile.txt","w+")
x="USA" + " " + " " + "124.356" + " " + " " + "2.236"
y="Netherlands" + " " + " " + "10.866" + " " + " " + "771"
z="Georgia" + " " + " " + "90" + " " + " " + "NA"
w="Germany" + " " + " " + "58.247" + " " + " " + "455"
f.write("Covid-19 Data" + "\n" + "Country" + " " + "/" + " " + "Number of infections" + " " + "/" + " " + "Number of Death" + "\n")
f.write(x + "\n")
f.write(y + "\n")
f.write(z + "\n")
f.write(w)
f.close()
with open("myfile.txt", "r") as file:
try:
for i in file:
t = i.split()
result=float(t[-1])/float(t[-2])
print(results)
except:
print("fail")
file.close()
Does someone have an idea how to solve this problem ?
You can do the following:
with open("myfile.txt", "r") as file:
for i in file:
t = i.split()
try:
result = float(t[-1]) / float(t[-2])
print(result)
except ValueError:
pass
At the time you don't know if the values you are trying to divide are numeric values or not, therefore surrounding the operation with a try-catch should solve your problem.
If you want to become a bit more "clean" you can do the following:
def is_float(value):
try:
float(value)
except ValueError:
return False
return True
with open("myfile.txt", "r") as file:
for i in file:
t = i.split()
if is_float(t[-1]) and is_float(t[-2]):
result = float(t[-1]) / float(t[-2])
print(result)
The idea is the same, however.
I used the same file that you attached in your example. I created this function hopefully it helps:
with open("test.txt","r") as reader:
lines = reader.readlines()
for line in lines[2:]:
line = line.replace(".","") # Remove points to have the full value
country, number_infections, number_deaths = line.strip().split()
try:
number_infections = float(number_infections)
number_deaths = float(number_deaths)
except Exception as e:
print(f"[WARNING] Could not convert Number of Infections {number_infections} or Number of Deaths {number_deaths} to float for Country: {country}\n")
continue
ratio = number_deaths/number_infections
print(f"Country: {country} D/I ratio: {ratio}")
As you can see I avoided the headers of your file using lines[2:] that means that I will start from row 3 of your file. Also, added try/exception logic to avoid non-float converts. Hope this helps!
Edit
Just noticed that the format for thousands is used with "." instead "," in that case the period was removed in line 7.
The results for this execution is:
Country: USA D/I ratio: 0.017980636237897647
Country: Netherlands D/I ratio: 0.07095527332965212
[WARNING] Could not convert Number of Infections 90.0 or Number of Deaths NA to float for Country: Georgia
Country: Germany D/I ratio: 0.007811561110443456
Fixed the following:
The first two lines in your text-file are headers. These need to be skipped
'NA' Can't be converted to zero
If there is a 0 in your data, your program would crash. Now it wouldn't.
f=open("myfile.txt","w+")
x="USA" + " " + " " + "124.356" + " " + " " + "2.236"
y="Netherlands" + " " + " " + "10.866" + " " + " " + "771"
z="Georgia" + " " + " " + "90" + " " + " " + "NA"
w="Germany" + " " + " " + "58.247" + " " + " " + "455"
f.write("Covid-19 Data" + "\n" + "Country" + " " + "/" + " " + "Number of infections" + " " + "/" + " " + "Number of Death" + "\n")
f.write(x + "\n")
f.write(y + "\n")
f.write(z + "\n")
f.write(w)
f.close()
with open("myfile.txt", "r") as file:
#Skipping headers
next(file)
next(file)
try:
for i in file:
t = i.split()
#Make sure your code keeps working when one of the numbers is zero
x = 0
y = 0
#There are some NA's in your file. Strings not representing
#a number can't be converted to float
if t[1] != "NA":
x = t[1]
if t[2] != "NA":
y = t[2]
if x == 0 or y == 0:
result = 0
else:
result=float(x)/float(y)
print(t[0] + ": " + str(result))
except:
print("fail")
file.close()
Output:
USA: 55.615384615384606
Netherlands: 0.014093385214007782
Georgia: 0
Germany: 0.12801538461538461
Your header line in the file is Covid-19 Data. this is the first line and when you call t=i.split() you then have a list t which has data ['Covid-19', 'Data']
you cannot convert these to floats since they have letters in them. Instead you should read the first 2 header line before the loop and do nothing with them. However you are then going to have issues with Georgia as "NA" also cannot be converted to a float.
A few other points, its not good practice to have a catch all exception. Also you dont need to close the file explicitly if you open the file using a with statement.
How can I print a new line on the output file? When I try to add the new line with "/n" it just prints /n
This is what I have so far.
``
inputFile = open("demofile1.txt", "r")
outFile = open("Ji
string = line.split(',')
go =(string)[3::]
bo = [float(i) for i in go]
total = sum(bo)
pine = ("%8.2f"%total)
name = string[2] + "," + " " + string[1]
kale = (string[0] + " " + name + " " + "/n")
se)
Current Result
8
53 Baul
A999999
You need to use \n, not /n. So this line:
kale = (string[0] + " " + name + " " + "/n")
Should be:
kale = (string[0] + " " + name + " " + "\n")
Also, please do consider using a str formatter, so all these lines:
go =(string)[3::]
bo = [float(i) for i in go]
total = sum(bo)
pine = ("%8.2f"%total)
name = string[2] + "," + " " + string[1]
kale = (string[0] + " " + name + " " + "/n")
str1 = ''.join(kale)
str2 = ''.join(pine)
outFile.write(str1 + " " + str2 + " ")
Will become:
outFile.write("{} {} {:8.2f}\n".format(string[0], string[2] + ", " + string[1], sum(bo))
This script was created by an ex-lab member that was quite a bit more adapt at Python scripting than I am.
I am attempting to find Cooccupancy between annotated peaks in "exon" regions of the entire human h19 genome. However, after trying to get this to run for about an hour I am looking for help.
Here is the script:
#!/usr/bin/python
import math
import sys
import re
import csv
import MySQLdb
import itertools
import argparse
# format for execution: ./findCooccupancy.py <loci file> <comma separated list of marks to check> <window size> <outputfile>
# example: ./findCooccupancy.py AllGenes.txt PolII-ChIP,KAP1-ChIP,Hexim 150 output.txt
# format of loci file:
# chr2 12345678 12345900 GENEA 1 +
# chr4 987654321 98765000 GENEB 1 -
# etc...
locifile = sys.argv[1]
marks = sys.argv[2]
window = int(sys.argv[3])
outputfile = sys.argv[4]
loci = list(csv.reader(open(locifile, 'rb'),delimiter='\t'))
#loci = list(itertools.chain.from_iterable(loci))
db = MySQLdb.connect(host="localhost",user="snrnp",passwd="snrnp",db="snrnp")
cur = db.cursor()
cntdict = {}
for mark in marks.split(","):
cntdict[mark] = []
counter = 1
for locus in loci:
print "Working on line# " + str(counter)
counter += 1
if str(locus[5]) == "+":
exon = locus[1]
else:
exon = locus[2]
for mark in marks.split(","):
# this is incredibly dirty. sorry. I don't have time to do this better
if mark == 'PolII-ChIP':
cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")")
#print "select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")"
else:
cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and ((chr_start < " + str(exon) + " and chr_end > " + str(exon) + ") or (abs(chr_start - " + str(exon) + ") < " + str(window) + ") or (abs(chr_end - " + str(exon) + ") < " + str(window) + "))")
#print "select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and ((chr_start < " + str(exon) + " and chr_end > " + str(exon) + ") or (abs(chr_start - " + str(exon) + ") < " + str(window) + ") or (abs(chr_end - " + str(exon) + ") < " + str(window) + "))"
cnt = cur.fetchone()[0]
if cnt > 0:
cntdict[mark].append(",".join(locus))
convertedlist = []
for key in cntdict.keys():
convertedlist.append(cntdict[key])
intersectlist = set(convertedlist[0]).intersection(*convertedlist[1:])
for key in cntdict.keys():
print str(key) + " hits: " + str(len(cntdict[key]))
print "\nTotal Intersection Count: " + str(len(intersectlist))
with open(outputfile, 'w') as outputwriter:
for line in intersectlist:
outputwriter.write(line + "\n")
This is the command line that I have been using:
./findCooccupancy.py ~/code/snRNP/analysis/from\ sequencing/KEC_Project/Pol-IIAnnotatedPeaksGenome.txt PolII-ChIP 150 KECExonOccupancy.txt
This is the latest error message I have received:
Working on line# 1
Traceback (most recent call last):
File "./findCooccupancy.py", line 41, in <module>
cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")")
File "/Library/Python/2.7/site-packages/MySQLdb/cursors.py", line 205, in execute
self.errorhandler(self, exc, value)
File "/Library/Python/2.7/site-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
_mysql_exceptions.OperationalError: (1054, "Unknown column 'Start' in 'where clause'")