Does anyone know why I get this error in the output of my Python script? I'm using lxml's etree.
The error shows up when I define the functions:
List all purchases of the year chosen by the user:
Buy: 2017/12/01, Delivered, Ship all over as each product is available.
Traceback (most recent call last):
  File "D:\ABC\1st year\1st Semester\TI\tp3\python\ola.py", line 55, in <module>
    printCompras(treeDoc, "2017", prefix)
  File "D:\ABC\1st year\1st Semester\TI\tp3\python\ola.py", line 20, in printCompras
    printCompra(compraElem, prefix)
  File "D:\ABC\1st year\1st Semester\TI\tp3\python\ola.py", line 34, in printCompra
    ", " + compra_id[0].text + ", " + compra_id[0].text + ", " + compra_id[0].text)
IndexError: list index out of range
My Python file:
from lxml import etree

file = "loja.xml"
treeDoc = etree.parse(file)

compraElem = treeDoc.xpath("//purchase")
listaUsers = treeDoc.xpath("//user")
productElem = treeDoc.xpath("//product")
caracteristicaElem = treeDoc.xpath("//product/technical_characteristics/feature")
olamundo = treeDoc.xpath("//purchase/product_bought")

print("List all purchases of the year chosen by the user:")
prefix = ""

def printCompras(treeDoc, year, prefix):
    for compra in compraElem:
        buy_date = compra.xpath("purchase_date")
        ano_data = (buy_date[0].text).split("/")
        ano_compra = ano_data[0]
        if year == ano_compra:
            printCompra(compraElem, prefix)

def printCompra(compraElem, prefix):
    for compra in compraElem:
        buyer_id = compraElem[0].get("buyer")
        for user in listaUsers:
            user_name = user.xpath("#user_id=" + buyer_id + "/name")
        compra_id = compra.xpath("compra_id")
        buy_date = compra.xpath("purchase_date")
        buy_account = compra.xpath("buy_account")[0]
        indication = compra.xpath("indication")
    for compra in compraElem:
        print(prefix + "Buy: " + buy_date[0].text + ", " + buy_date.get("situation")
              + ", " + indication[0].text
              + ", " + compra_id[0].text + ", " + compra_id[0].text + ", " + compra_id[0].text)
        for product in productElem:
            carrier = olamundo[0].get("carrier")
            printProduct(productName, carrier, prefix)

def printProduct(productElem, carrier, prefix):
    product_id = olamundo[0].get("#product_id")
    product_name = treeDoc.xpath("//buy[purchased_product/#product_id=]")
    price = treeDoc.xpath("//product bla bla bla")
    print(prefix + "Product: " + product_id + ", " + product_name[0].text + ", " + price[0].text)
    print(prefix + "Product: " + product_id + ", " + olamundo[0].get("#carrier"))

def printVender():
    identifier, name, average_rating_in_stars

def printCaracteristica(caracteristicaElem, prefix):
    name = caracteristicaElem.get("name")
    value = caracteristicaElem.get("value")
    print(prefix + "Characteristics: " + name + " - " + value)

printCompras(treeDoc, "2017", prefix)
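The IndexError means one of the xpath() calls returned an empty list, so indexing it with [0] fails; according to the traceback it is compra_id = compra.xpath("compra_id") that finds no matching child element. A minimal sketch of a guard, reusing the names from the code above:

def first_text(elem, path, default=""):
    # xpath() returns a (possibly empty) list of matches
    hits = elem.xpath(path)
    return hits[0].text if hits else default

# instead of compra.xpath("compra_id")[0].text:
compra_id_text = first_text(compra, "compra_id")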
My scraper bot was working normally until I added a merchant column to the database. Scraper.py scrapes the merchant successfully and records it in the database correctly, but the compare bot no longer compares after adding the merchant column.
My original code (compare succeeds):
# The max_date a product can have in order to be considered "old"
limit = datetime.datetime.strptime(products[-1][1], "%Y-%m-%d %H:%M:%S.%f") + \
    datetime.timedelta(minutes=OLDER_PRODUCTS_RANGE)
# Separate "old" from "new" products
for i, product in enumerate(products[::-1]):
    date = datetime.datetime.strptime(
        product[1], "%Y-%m-%d %H:%M:%S.%f")
    index = len(products) - i - 1
    if date > limit:
        old_products = products[index+1:]
        new_products = products[:index+1]
        break
# If we have only one or even none of the lists, return
if len(new_products) == 0 or len(old_products) == 0:
    c.close()
    self.save_db(db)
    return True
older_product = min(old_products, key=lambda x: x[5])
current_product = min(new_products, key=lambda x: x[5])
first_price = older_product[5]
current_price = current_product[5]
current_date = current_product[1]
url = current_product[6]
price_difference = first_price - current_price
percentage = price_difference / first_price
percentage_str = str("%.2f" % (percentage * 100))
# If the drop in the price was greater than the expected percentage, warn the user
if percentage >= self.percentage:
    rowid = current_product[0]
    product_name = current_product[4]
    product_id = current_product[3]
    print(
        f"[Bot] [{current_date}] Price of \"{product_name}\" is {percentage_str}% off")
    message = product_name + "\n\n" + \
        str(first_price) + " TL >>>> " + \
        str(current_price) + f" TL - {percentage_str}%" + "\n\n" + \
        MAIN_URL + url + "\n\n" + \
        MAIN_URL + "ara?q=" + product_id
    context.bot.send_message(
        chat_id=CHANNEL_ID,
        text=message
    )
    c.execute(
        "INSERT INTO deleted SELECT rowid FROM products WHERE product_id = %s AND rowid != %s;",
        (product_id, rowid)
    )
c.close()
self.save_db(db)
return True
My edited code (the price compare no longer works):
# The max_date a product can have in order to be considered "old"
limit = datetime.datetime.strptime(products[-1][1], "%Y-%m-%d %H:%M:%S.%f") + \
    datetime.timedelta(minutes=OLDER_PRODUCTS_RANGE)
# Separate "old" from "new" products
for i, product in enumerate(products[::-1]):
    date = datetime.datetime.strptime(
        product[1], "%Y-%m-%d %H:%M:%S.%f")
    index = len(products) - i - 1
    if date > limit:
        old_products = products[index+1:]
        new_products = products[:index+1]
        break
# If we have only one or even none of the lists, return
if len(new_products) == 0 or len(old_products) == 0:
    c.close()
    self.save_db(db)
    return True
older_product = min(old_products, key=lambda x: x[5])
current_product = min(new_products, key=lambda x: x[5])
first_price = older_product[5]
current_price = current_product[5]
current_date = current_product[1]
url = current_product[6]
merchant = current_product[7]
price_difference = first_price - current_price
percentage = price_difference / first_price
percentage_str = str("%.2f" % (percentage * 100))
# If the drop in the price was greater than the expected percentage, warn the user
if percentage >= self.percentage:
    rowid = current_product[0]
    product_name = current_product[4]
    product_id = current_product[3]
    print(
        f"[Bot] [{current_date}] Price of \"{product_name}\" is {percentage_str}% off")
    message = product_name + "\n\n" + \
        + "Satici Adi:" + merchant + "\n\n" + \
        str(first_price) + " TL >>>> " + \
        str(current_price) + f" TL - {percentage_str}%" + "\n\n" + \
        MAIN_URL + url + "?magaza=" + merchant + "\n\n" + \
        MAIN_URL + "ara?q=" + product_id
    context.bot.send_message(
        chat_id=CHANNEL_ID,
        text=message
    )
    c.execute(
        "INSERT INTO deleted SELECT rowid FROM products WHERE product_id = %s AND rowid != %s;",
        (product_id, rowid)
    )
c.close()
self.save_db(db)
return True
I use Python with beautifulsoup4. What's wrong?
I suppose the problem is the extra + symbol here: the first line ends with + \ and the next line starts with another +, so Python parses a unary + applied to the string "Satici Adi:", which raises a TypeError at runtime:
message = product_name + "\n\n" + \
    + "Satici Adi:" + merchant + "\n\n" + \
    str(first_price) + " TL >>>> " + \
    str(current_price) + f" TL - {percentage_str}%" + "\n\n" + \
    MAIN_URL + url + "?magaza=" + merchant + "\n\n" + \
    MAIN_URL + "ara?q=" + product_id
You should replace it with:
message = product_name + "\n\n" + \
    "Satici Adi:" + merchant + "\n\n" + \
    str(first_price) + " TL >>>> " + \
    str(current_price) + f" TL - {percentage_str}%" + "\n\n" + \
    MAIN_URL + url + "?magaza=" + merchant + "\n\n" + \
    MAIN_URL + "ara?q=" + product_id
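As a side note, building the message as one f-string inside parentheses avoids backslash continuations entirely, so this kind of stray + cannot happen. A sketch using the same variables:

# implicit string concatenation inside parentheses, no backslashes needed
message = (
    f"{product_name}\n\n"
    f"Satici Adi: {merchant}\n\n"
    f"{first_price} TL >>>> {current_price} TL - {percentage_str}%\n\n"
    f"{MAIN_URL}{url}?magaza={merchant}\n\n"
    f"{MAIN_URL}ara?q={product_id}"
)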
elif choice == "A":
def add_task(count):
with open ( 'user.txt' ) as fin:
usernames = [ i.split ( ',' ) [ 0 ] for i in fin.readlines ( ) if len ( i ) > 3 ]
task = input ( "Please enter the username of the person the task is assigned to.\n" )
while task not in usernames:
task = input ( "Username not registered. Please enter a valid username.\n" )
else:
task_title = input ( "Please enter the title of the task.\n" )
task_description = input ( "Please enter the task description.\n" )
task_due = input ( "Please input the due date of the task. (yyyy-mm-dd)\n" )
date = datetime.date.today ( )
task_completed = False
if task_completed == False:
task_completed = "No"
else:
task_completed = ("Yes")
with open ( 'tasks.txt' , 'a+' ) as task1:
count = count + 1
task1.write ( "\n User assigned to task: " , str (
count ) + "\n" + task + "\nTask Title :" + "\n" + task_title + "\n" + "Task Description:\n" + task_description + "\n" + "Task Due Date:\n" + str (
task_due ) + "\n" + "Date Assigned:\n" + str (
date ) + "\n" + "Task Completed:\n" + task_completed + "\n" )
file.close ( )
print ( "The new assigned task has been saved" )
count = 0
add_task ( count )
Replace
task1.write ( "\n User assigned to task: " , str (
count ) + "\n" + task + "\nTask Title :" + "\n" + task_title + "\n" + "Task Description:\n" + task_description + "\n" + "Task Due Date:\n" + str (
task_due ) + "\n" + "Date Assigned:\n" + str (
date ) + "\n" + "Task Completed:\n" + task_completed + "\n" )
With
task1.write ( "\n User assigned to task: " + str (
count ) + "\n" + task + "\nTask Title :" + "\n" + task_title + "\n" + "Task Description:\n" + task_description + "\n" + "Task Due Date:\n" + str (
task_due ) + "\n" + "Date Assigned:\n" + str (
date ) + "\n" + "Task Completed:\n" + task_completed + "\n" )
You concatenate the first two strings with a comma, while all the other strings are concatenated with a + sign. Since write() takes exactly one string argument, the comma turns the rest into a second argument and Python raises a TypeError.
You have to change the task1.write line. It should be:
task1.write("\n User assigned to task: " + str(count) + "\n" + task + "\nTask Title :" + "\n" + task_title + "\n" + "Task Description:\n" + task_description + "\n" + "Task Due Date:\n" + str(task_due) + "\n" + "Date Assigned:\n" + str(date) + "\n" + "Task Completed:\n" + task_completed + "\n")
My code throws an error and I really can't understand why. Here it is:
File "alon.py", line 152
    fig.write_image("files/table_" + product_name + ".pdf")
    ^
IndentationError: unindent does not match any outer indentation level
If I remove this line, it works. I can't see how it is mis-indented: it is under the if type(product_data) is dict: statement, at the same level as the last line of code before it. What can cause such behaviour?
import MySQLdb
from plotly import graph_objs as go
import numpy as np
import os
from plotly.subplots import make_subplots
from PyPDF2 import PdfFileMerger
from datetime import datetime, timedelta

# Database connect
db = MySQLdb.connect(host="localhost",
                     user="root",
                     passwd="abc9110033969",
                     db="alon")

today = datetime.today().strftime('%Y-%m-%d')
one_week = (datetime.today() - timedelta(days=7)).strftime('%Y-%m-%d')
two_week = (datetime.today() - timedelta(days=14)).strftime('%Y-%m-%d')
three_week = (datetime.today() - timedelta(days=21)).strftime('%Y-%m-%d')
four_week = (datetime.today() - timedelta(days=28)).strftime('%Y-%m-%d')

# Functions
def load_post_views(table, today, one_week, two_week, three_week, four_week):
    product_views_dict = dict()
    cursor = db.cursor()
    cursor.execute(
        "SELECT client_id, product_id, referrer, `date`" +
        " FROM " + table +
        " WHERE `date`>='" + four_week + "'")
    social_dict = {
        "facebook": 0,
        "twitter": 0,
        "instagram": 0,
        "linkedin": 0,
        "pinterest": 0,
        "website": 0,
    }
    for x in range(0, cursor.rowcount):
        row = cursor.fetchone()
        network = ""
        period = ""
        client_id = row[0]
        product_id = row[1]
        referrer = row[2]
        date = str(row[3])
        email_cursor = db.cursor()
        email_cursor.execute("SELECT address FROM c8ty_connections_email WHERE entry_id=" + str(client_id))
        email = email_cursor.fetchone()
        product_cursor = db.cursor()
        product_cursor.execute("SELECT post_title FROM c8ty_posts WHERE id=" + str(product_id))
        product_name = product_cursor.fetchone()
        # Add client ID key
        if client_id not in product_views_dict:
            product_views_dict[client_id] = dict()
        # Add product ID key to client ID parent key
        if product_id not in product_views_dict[client_id]:
            product_views_dict[client_id][product_id] = {
                today + " - " + one_week: social_dict,
                one_week + " - " + two_week: social_dict,
                two_week + " - " + three_week: social_dict,
                three_week + " - " + four_week: social_dict
            }
        # Find referrer
        if "facebook" in referrer:
            network = "facebook"
        elif "twitter" in referrer:
            network = "twitter"
        elif "instagram" in referrer:
            network = "instagram"
        elif "linkedin" in referrer:
            network = "linkedin"
        elif "pinterest" in referrer:
            network = "pinterest"
        else:
            network = "website"
        # Check view period
        if date <= today and date > one_week:
            period = today + " - " + one_week
        if date <= one_week and date > two_week:
            period = one_week + " - " + two_week
        if date <= two_week and date > three_week:
            period = two_week + " - " + three_week
        if date <= three_week and date > four_week:
            period = three_week + " - " + four_week
        product_views_dict[client_id][product_id][period][network] += 1
        product_views_dict[client_id]["email"] = email[0]
        product_views_dict[client_id][product_id]["product"] = product_name[0]
    return product_views_dict

# Init
product_views_dict = load_post_views("an_product_view", today, one_week, two_week, three_week, four_week)
brochure_views_dict = load_post_views("an_brochure_view", today, one_week, two_week, three_week, four_week)

for clinetID, product_info in product_views_dict.items():
    client_email = product_info["email"]
    for productID, product_data in product_info.items():
        if type(product_data) is dict:
            product_name = product_data['product']
            table_data = [
                [
                    today + " - " + one_week,
                    one_week + " - " + two_week,
                    two_week + " - " + three_week,
                    three_week + " - " + four_week,
                    today + " - " + four_week
                ]
            ]
            for network in ["website", "facebook", "twitter", "instagram", "linkedin", "pinterest"]:
                table_data.append([
                    product_data[today + " - " + one_week][network],
                    product_data[one_week + " - " + two_week][network],
                    product_data[two_week + " - " + three_week][network],
                    product_data[three_week + " - " + four_week][network],
                    sum([
                        int(product_data[today + " - " + one_week][network]),
                        int(product_data[one_week + " - " + two_week][network]),
                        int(product_data[two_week + " - " + three_week][network]),
                        int(product_data[three_week + " - " + four_week][network])
                    ])
                ])
            fig = make_subplots(rows=5, cols=2)
            # Create product table
            fig.add_trace(
                go.Table(
                    header=dict(values=["Period", "Website", "Facebook", "Twitter", "Instagram", "LinkedIn", "Pinterest", "Total"]),
                    cells=dict(values=table_data)
                )
            )
            # Create folder if it doesn't exist
            if not os.path.exists("files"):
                os.mkdir("files")
            # Write pdf
            fig.write_image("files/table_" + product_name + ".pdf")

db.close()
exit()
Check that your source file isn't mixing spaces and tabs. Indentation should be consistent, only one or the other; I recommend spaces, to comply with PEP 8.
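If you want to locate the offending lines programmatically, here is a minimal sketch (assuming the file is alon.py, as in the traceback) that flags tab-indented lines so you can compare them against the space-indented rest:

with open("alon.py") as f:
    for lineno, line in enumerate(f, start=1):
        # leading whitespace of this line
        indent = line[:len(line) - len(line.lstrip())]
        if "\t" in indent and " " in indent:
            print(f"line {lineno}: tabs and spaces mixed within the indent")
        elif "\t" in indent:
            print(f"line {lineno}: indented with tabs")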
I am trying to scrape data from soccerway.com, checking whether each page is a completed game or a game still to be played, and writing each kind to a separate csv file. I am running through 10,000 pages, so I have written it using Pools. However, the lists I append to stay empty, and I cannot write anything to the csv files.
I tried writing straight to the file instead of appending to lists, but this gave incomplete files.
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import uuid
from multiprocessing import Pool
import sys, os

fixturesA = []
linksA = []
statsA = []

def parse(url):
    try:
        #print(url)
        delays = [0.25, 0.5, 0.75, 1]
        delay = np.random.choice(delays)
        #time.sleep(delay)
        #r = requests.get(url)
        r = requests.get(url, timeout=10)
        soup = BeautifulSoup(r.content, "html.parser")
        teams = soup.findAll('h3', attrs={'class': 'thick'})
        homeTeam = teams[0].text.strip()
        awayTeam = teams[2].text.strip()
        middle = teams[1].text.strip()
        dds = soup.findAll('dd')
        date = dds[1].text.strip()
        gameWeek = dds[2].text.strip()
        if ':' not in middle:
            middle = middle.split(" - ")
            homeGoals = 0
            awayGoals = 0
            homeGoals = middle[0]
            try:
                awayGoals = middle[1]
            except Exception as e:
                homeGoals = "-1"
                awayGoals = "-1"
            matchGoals = int(homeGoals) + int(awayGoals)
            if (matchGoals >= 0):
                if (int(homeGoals) > 0 and int(awayGoals) > 0):
                    btts = "y"
                else:
                    btts = "n"
            halfTimeScore = dds[4].text.strip().split(" - ")
            firstHalfHomeGoals = halfTimeScore[0]
            firstHalfAwayConc = halfTimeScore[0]
            firstHalfAwayGoals = halfTimeScore[1]
            firstHalfHomeConc = halfTimeScore[1]
            firstHalfTotalGoals = int(firstHalfHomeGoals) + int(firstHalfAwayGoals)
            secondHalfHomeGoals = int(homeGoals) - int(firstHalfHomeGoals)
            secondHalfAwayConc = int(homeGoals) - int(firstHalfHomeGoals)
            secondHalfAwayGoals = int(awayGoals) - int(firstHalfAwayGoals)
            secondHalfHomeConc = int(awayGoals) - int(firstHalfAwayGoals)
            secondHalfTotalGoals = matchGoals - firstHalfTotalGoals
            homeTeamContainers = soup.findAll('div', attrs={'class': 'container left'})
            homeTeamStarting = homeTeamContainers[2]
            homeTeamBench = homeTeamContainers[3]
            homeTeamYellows = len(homeTeamStarting.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/700/img/events/YC.png'})) + len(homeTeamBench.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/699/img/events/YC.png'}))
            homeTeamReds = len(homeTeamStarting.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/700/img/events/RC.png'})) + len(homeTeamBench.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/699/img/events/RC.png'}))
            homeTeamCards = homeTeamYellows + homeTeamReds
            awayTeamContainers = soup.findAll('div', attrs={'class': 'container right'})
            awayTeamStarting = awayTeamContainers[2]
            awayTeamBench = awayTeamContainers[3]
            awayTeamYellows = len(awayTeamStarting.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/700/img/events/YC.png'})) + len(awayTeamBench.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/699/img/events/YC.png'}))
            awayTeamReds = len(awayTeamStarting.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/700/img/events/RC.png'})) + len(awayTeamBench.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/699/img/events/RC.png'}))
            awayTeamCards = awayTeamYellows + awayTeamReds
            matchCards = homeTeamCards + awayTeamCards
            try:
                iframe = soup.findAll('iframe')
                iframeSrc = iframe[1]['src']
                url = 'https://us.soccerway.com/' + iframeSrc
                c = requests.get(url, timeout=10)
                soupC = BeautifulSoup(c.content, "html.parser")
                cornerContainer = soupC.findAll('td', attrs={'class': 'legend left value'})
                homeCorners = cornerContainer[0].text.strip()
                awayCornersConc = homeCorners
                cornerContainer = soupC.findAll('td', attrs={'class': 'legend right value'})
                awayCorners = cornerContainer[0].text.strip()
                homeCornersConc = awayCorners
                matchCorners = int(homeCorners) + int(awayCorners)
                print("Got Score . " + homeTeam + " vs " + awayTeam + " . " + gameWeek)
                statsA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + homeCorners + "," + awayCorners + "," + homeCornersConc + "," + awayCornersConc + "," + str(matchCorners) + "," + dds[0].text.strip() + "\n")
                return None
            except Exception as e:
                print("Got Score no corners. " + homeTeam + " vs " + awayTeam + " . " + gameWeek + " NO FRAME")
                statsA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + "" + "," + "" + "," + "" + "," + "" + "," + "" + "," + dds[0].text.strip() + "\n")
                return None
        else:
            fixturesA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + date + "\n")
            linksA.append(url + "\n")
            print(homeTeam + " vs " + awayTeam + " at " + middle + " GW:" + gameWeek)
            return None
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        linksA.append(url + "\n")
        print(url)
        return None

stats = open('Statsv2.csv', 'a', encoding='utf-8')
fixtures = open('fixturesv2.csv', 'w', encoding='utf-8')

with open('links.txt') as f:
    content = f.readlines()
content = [x.strip() for x in content]

links = open('links.txt', 'w')

if __name__ == '__main__':
    start_time = time.time()
    p = Pool(20)  # Pool tells how many at a time
    records = p.map(parse, content)
    p.terminate()
    p.join()
    print("--- %s seconds ---" % (time.time() - start_time))
I assume you are running Windows? Then the answer is that multiprocessing on Windows spawns copies of the parent process instead of forking it. So your main process has its lists, and each worker process (from the pool) gets its own separate set of lists.
The workers most likely fill their lists correctly, but the lists in the main process never receive any data and so stay empty, and the workers do not return anything. Since you write your files in the main process, you get empty files.
An easy way to solve this is to create pipes or queues between the main process and the workers to allow communication between the processes. You could also use the shared arrays provided by the multiprocessing module, but then you would need to know the length at creation time.
See the documentation: Multiprocessing
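For illustration, a minimal sketch of the shared-state idea using a multiprocessing.Manager list, a proxy object that workers can append to and that the parent can read; parse and content are the names from the question, and the scraping body is elided:

from multiprocessing import Pool, Manager

def parse(url, results):
    # ... scrape as before ...
    results.append(url)  # a manager proxy, so the parent sees this append

if __name__ == '__main__':
    with Manager() as manager:
        results = manager.list()
        with Pool(20) as p:
            p.starmap(parse, [(u, results) for u in content])
        records = list(results)  # copy out before the manager shuts down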
As pointed out by @RaJa, you're not actually returning anything the parent/controlling process can see. The easiest fix is to return values from the mapped function.
For example, parse() could return a tuple at the end:
def parse(url):
    # do work
    return url, homeTeam, awayTeam, gameWeek, homeGoals, awayGoals  # ...
Then the parent process can receive the values and do something useful with them, such as saving them to a CSV file:
import csv

with Pool(20) as pool:
    records = pool.map(parse, content)

with open('stats.csv', 'w') as fd:
    out = csv.writer(fd)
    out.writerow([
        'url', 'hometeam', 'awayteam',
        # and the remaining column names for the header
    ])
    out.writerows(records)
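Note that Pool.map returns the results in the same order as content, so each CSV row lines up with its input URL. If parse() can still return None for a failed page, filter those out first (e.g. records = [r for r in records if r]) before calling writerows, since the csv writer expects every row to be iterable.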
This script was created by an ex-lab member who was quite a bit more adept at Python scripting than I am.
I am attempting to find co-occupancy between annotated peaks in "exon" regions of the entire human hg19 genome. However, after trying to get this to run for about an hour, I am looking for help.
Here is the script:
#!/usr/bin/python
import math
import sys
import re
import csv
import MySQLdb
import itertools
import argparse

# format for execution: ./findCooccupancy.py <loci file> <comma separated list of marks to check> <window size> <outputfile>
# example: ./findCooccupancy.py AllGenes.txt PolII-ChIP,KAP1-ChIP,Hexim 150 output.txt
# format of loci file:
# chr2 12345678 12345900 GENEA 1 +
# chr4 987654321 98765000 GENEB 1 -
# etc...
locifile = sys.argv[1]
marks = sys.argv[2]
window = int(sys.argv[3])
outputfile = sys.argv[4]

loci = list(csv.reader(open(locifile, 'rb'), delimiter='\t'))
#loci = list(itertools.chain.from_iterable(loci))
db = MySQLdb.connect(host="localhost", user="snrnp", passwd="snrnp", db="snrnp")
cur = db.cursor()
cntdict = {}
for mark in marks.split(","):
    cntdict[mark] = []
counter = 1
for locus in loci:
    print "Working on line# " + str(counter)
    counter += 1
    if str(locus[5]) == "+":
        exon = locus[1]
    else:
        exon = locus[2]
    for mark in marks.split(","):
        # this is incredibly dirty. sorry. I don't have time to do this better
        if mark == 'PolII-ChIP':
            cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")")
            #print "select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")"
        else:
            cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and ((chr_start < " + str(exon) + " and chr_end > " + str(exon) + ") or (abs(chr_start - " + str(exon) + ") < " + str(window) + ") or (abs(chr_end - " + str(exon) + ") < " + str(window) + "))")
            #print "select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and ((chr_start < " + str(exon) + " and chr_end > " + str(exon) + ") or (abs(chr_start - " + str(exon) + ") < " + str(window) + ") or (abs(chr_end - " + str(exon) + ") < " + str(window) + "))"
        cnt = cur.fetchone()[0]
        if cnt > 0:
            cntdict[mark].append(",".join(locus))

convertedlist = []
for key in cntdict.keys():
    convertedlist.append(cntdict[key])
intersectlist = set(convertedlist[0]).intersection(*convertedlist[1:])

for key in cntdict.keys():
    print str(key) + " hits: " + str(len(cntdict[key]))
print "\nTotal Intersection Count: " + str(len(intersectlist))

with open(outputfile, 'w') as outputwriter:
    for line in intersectlist:
        outputwriter.write(line + "\n")
This is the command line that I have been using:
./findCooccupancy.py ~/code/snRNP/analysis/from\ sequencing/KEC_Project/Pol-IIAnnotatedPeaksGenome.txt PolII-ChIP 150 KECExonOccupancy.txt
This is the latest error message I have received:
Working on line# 1
Traceback (most recent call last):
  File "./findCooccupancy.py", line 41, in <module>
    cur.execute("select count(*) from CHIP_PEAK where mark = '" + str(mark) + "' and chr = '" + str(locus[0]) + "' and (abs(summit - " + str(exon) + ") < " + str(window) + ")")
  File "/Library/Python/2.7/site-packages/MySQLdb/cursors.py", line 205, in execute
    self.errorhandler(self, exc, value)
  File "/Library/Python/2.7/site-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
    raise errorclass, errorvalue
_mysql_exceptions.OperationalError: (1054, "Unknown column 'Start' in 'where clause'")
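The error message suggests that exon holds the literal text Start, i.e. the loci file probably begins with a header row, so the string-built SQL becomes abs(summit - Start) and MySQL reads Start as a column name. A hedged sketch of the same query using MySQLdb parameter binding, which would fail loudly in Python instead (converting exon with int() is an assumption added for illustration):

# bind values instead of pasting them into the SQL string;
# int(exon) raises ValueError immediately if exon is header text like "Start"
cur.execute(
    "select count(*) from CHIP_PEAK"
    " where mark = %s and chr = %s and abs(summit - %s) < %s",
    (mark, locus[0], int(exon), window),
)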