Can I use bisect to print the content of a line? - python

I have a file whose lines are sorted alphabetically. The file is 12 GB, so I can't simply scan it line by line. The data looks like this:
brown 0 1 0 1 2
fox 3 5 0 0 1
jumped 2 0 6 1 0
The words at the beginning of each line are unique. The word and the numbers on each line are separated by tabs. I want to be able to query the file for specific keywords. For example, if I query "fox", the program should return "fox 3 5 0 0 1".
It seems that a good candidate for this would be the bisect module: https://docs.python.org/3.0/library/bisect.html
I found a post which uses bisect to find out the line number of a keyword: How do I perform binary search on a text file to search a keyword in python?
This is what the code looks like:
import bisect
import os

class Query(object):
    def __init__(self, query, index=5):
        self.query = query
        self.index = index

    def __lt__(self, comparable):
        return self.query < comparable[self.index:]

class FileSearcher(object):
    def __init__(self, file_pointer, record_size=35):
        self.file_pointer = file_pointer
        self.file_pointer.seek(0, os.SEEK_END)
        self.record_size = record_size + len(os.linesep)
        self.num_bytes = self.file_pointer.tell()
        self.file_size = (self.num_bytes // self.record_size)

    def __len__(self):
        return self.file_size

    def __getitem__(self, item):
        self.file_pointer.seek(item * self.record_size)
        return self.file_pointer.read(self.record_size)

with open('myfile') as file_to_search:
    query = 'fox\t'  # token to query
    wrapped_query = Query(query)
    searchable_file = FileSearcher(file_to_search)
    linepos = bisect.bisect(searchable_file, wrapped_query)
    print "Located # line: ", linepos
    # print content of line?
However, I can't figure out how to actually print the content of the line. I should at least add a read statement somewhere, but I don't know where.
Is it possible to print the content of the line with the bisect module?

If you want to go with a pure-Python solution, you can do the following:
Read the file in small chunks of MAX_LINE bytes, each time moving forward by a fixed offset
That offset determines the block size
For each such read, determine the key (the first word in a line)
These keys serve as delimiters of the blocks
Construct the list of such keys. The list will be sorted, since the keys are ordered in the file
You may persist such a list somewhere via pickle/json.dumps/...
When querying, find via bisect the index of the block where your key is located
Read that block entirely and find the key with its data
Here is the example file bigfile:
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7
The code:
import os
from bisect import bisect

MAX_LINE = 7
BLOCK_SIZE = 10

def parse_chunks(filename):
    size = os.path.getsize(filename)
    chunks = []
    with open(filename, 'rb') as file:
        # decode() rather than str() so '\n' is a real newline,
        # not the escaped text inside a bytes repr
        block = file.read(MAX_LINE * 2).decode()
        first_line = block[:block.find('\n') + 1]
        chunks.append(first_line.split()[0])
        pos = BLOCK_SIZE
        while pos < size:
            file.seek(pos)
            block = file.read(MAX_LINE * 2).decode()
            first_eol = block.find('\n')
            second_eol = block.find('\n', first_eol + 1)
            if first_eol == -1 or second_eol == -1:
                break
            line = block[first_eol + 1:second_eol]
            key = line.split()[0]
            chunks.append(key)
            pos += BLOCK_SIZE
    return chunks

if __name__ == '__main__':
    BLOCK_SIZE = 10
    filename = 'bigfile'
    chunks = parse_chunks(filename)
    query = 'abc'
    pos_before = bisect(chunks, query) - 1
    with open(filename, 'rb') as file:
        file.seek(pos_before * BLOCK_SIZE)
        block = file.read(BLOCK_SIZE + MAX_LINE).decode()
        line_start = block.find(query)
        line_end = block.find('\n', line_start + 1)
        line = block[line_start:line_end]
        print(line)
In this toy example I use a block size of 10 bytes; for your 12 GB file I'd suggest starting with something around 1 MB.
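As mentioned in the steps above, the chunk index only needs to be built once and can be persisted with pickle. A minimal sketch along those lines, assuming the parse_chunks function above and a hypothetical cache file name chunks.idx:
import os
import pickle

INDEX_FILE = 'chunks.idx'  # hypothetical cache file for the key list

def load_or_build_chunks(filename):
    # Rebuild the index only if the cache is missing or older than the data file
    if os.path.exists(INDEX_FILE) and os.path.getmtime(INDEX_FILE) >= os.path.getmtime(filename):
        with open(INDEX_FILE, 'rb') as fh:
            return pickle.load(fh)
    chunks = parse_chunks(filename)
    with open(INDEX_FILE, 'wb') as fh:
        pickle.dump(chunks, fh)
    return chunks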

The following recursive function should be able to narrow the search interval; I'm not sure how best to modify it so that it returns a match, or None when there is no match.
def bisearch(f, word, i, j):
    if (j - i) < 1E6:
        return i, j
    k = (i + j) // 2
    f.seek(k)
    # skip forward to the start of the next line
    while k < j:
        c = f.read(1)
        k = k + 1
        if c == '\n':
            break
    else:
        # ??? no match ??? I'm not sure
        pass
    # read the first word on that line
    w = []
    while 1:
        c = f.read(1)
        if c == '\t':
            break
        w.append(c)
    w = "".join(w)
    if w == word:
        return k, k
    if w < word:
        return bisearch(f, word, k, j)
    else:
        return bisearch(f, word, i, k)
and here is an example of usage
word = ...
f = open(...)
i, j = bisearch(f, word, 0, len_f)
f.seek(i)
if i == j:
    line = f.readline()
else:
    #################### EDIT ################
    # OLD
    # buffer = f.read(1E6)
    # NEW
    buffer = f.read(j - i)
    lenw = len(word)
    for line in buffer.split('\n'):
        if line[:lenw] == word:
            break
    else:
        # no matches, SOS
        pass
result = process(line)

Try seeking to the line in question and using readline.
print "Located # line: ", linepos
file_to_search.seek(linepos)
line = file_to_search.readline()
This is assuming linepos is the position of the line, counted in bytes from the beginning of the file. If it's the position counted in line numbers, you'll need to multiply by the number of bytes per line before seeking.
print "Located # line: ", linepos
file_to_search.seek(linepos * searchable_file.record_size)
line = file_to_search.readline()
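Alternatively, since FileSearcher already defines __getitem__, you could read the record directly by index instead of seeking by hand. A rough sketch (not from the original answer), assuming fixed-width records and that the Query comparison puts an exact match just before the insertion point that bisect returns:
linepos = bisect.bisect(searchable_file, wrapped_query)
# bisect gives an insertion point, so look at the record just before it
candidate = searchable_file[linepos - 1]
if candidate.startswith(query):
    print(candidate.rstrip())
else:
    print('keyword not found')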

Related

Parsing a total.txt file by keywords in it

I'm having trouble parsing a file. I have code that parses the file by the word Total: if its value is greater than 20.0, and returns the data. I need to change the search keyword to Tokens eth: with a value greater than 20.0, output all data between the ======== separators, and additionally write all sorted values into a sort.txt file. I would be grateful for professional help.
Code:
outlist = []
flag = False

def dump(list_, flag_):
    if list_ and flag_:
        print('\n'.join(list_))
    return [], False

with open('total.txt') as file:
    for line in map(str.strip, file):
        if line.startswith('='):
            outlist, flag = dump(outlist, flag)
        else:
            tokens = line.split()
            if len(tokens) == 3 and tokens[1] == 'Total:':
                try:
                    flag = float(tokens[2][:-1]) > 20.0
                except ValueError:
                    pass
            outlist.append(line)
dump(outlist, flag)
total.txt
============
| hafuia
| 0xb34a47885262f9d8673dc77de7b583961134f09fb03620b29d282c32ee6932be
| 0xD0b2612a6eE3111114b43b25322C6F08A251D38D
| Total: 47.62874464666479$
|
|
| Tokens eth:
| 20.608732$ MANA
|
| Protocols cro:
| 17.840052$ VVS Finance
| 8.953779$ V3S Finance
============
| asdf
| 0x72e164aa187feaff7cb28a74b7ff800a0dfe916594c70f141069669e9df5a23b
| 0xC7dFe558ed09F0f3b72eBb0A04e9d4e99af0bd0D
| Total: 22.908481672796988$
|
|
| Tokens eth:
| 22.376087$ SOS
============
| asdf
| 0xbce666bca3c862a2ee44651374f95aca677de16b4922c6d5e7d922cc0ac42a3d
| 0x5870923a244f52fF2D119fbf5525421E32EC006e
| Total: 9.077030269778557$
|
|
| Tokens eth:
| 8.942218$ SOS
============
This is how you can parse the file.
def parse_output(filename):
    outlist = []
    with open(filename) as file:
        new_block = False
        to_write = False
        lines_arr = []
        for line in map(str.strip, file):
            if line.startswith('======='):
                new_block = not new_block
            if new_block:
                if to_write:
                    outlist.append(lines_arr)
                lines_arr = []
                new_block = False
                to_write = False
            else:
                lines_arr.append(line)
                if 'Total:' in line:
                    num = float(line.split()[-1][:-1])
                    if num > 20:
                        to_write = True
    return outlist

def write_output(outlist, filename):
    for block in outlist:
        for line in block:
            with open(filename, 'a') as out_file:
                out_file.write(line + '\n')
        with open(filename, 'a') as out_file:
            out_file.write('=======' + '\n')

if __name__ == '__main__':
    write_output(parse_output('total.txt'), 'output.txt')
I missed the sorted wallet requirement. For sorting, while appending a block to outlist you can keep a separate array for the order, or prepend the total to the block, sort the output, and skip the first element while writing.
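A minimal sketch of that idea, keeping the parsing from parse_output above but prepending each block with its Total so the blocks can be sorted and the number skipped when writing (the file layout assumptions match the total.txt sample from the question):
def parse_output_with_totals(filename):
    # Same parsing idea as parse_output above, but each saved block is
    # prepended with its Total value so the blocks can be sorted later.
    outlist = []
    with open(filename) as file:
        lines_arr = []
        total = 0.0
        to_write = False
        for line in map(str.strip, file):
            if line.startswith('======='):
                if to_write:
                    outlist.append([total] + lines_arr)
                lines_arr = []
                total = 0.0
                to_write = False
            else:
                lines_arr.append(line)
                if 'Total:' in line:
                    total = float(line.split()[-1][:-1])
                    to_write = total > 20
    return outlist

def write_sorted(outlist, filename):
    # Sort on the prepended total (element 0) and skip it when writing.
    with open(filename, 'w') as out_file:
        for block in sorted(outlist, key=lambda b: b[0], reverse=True):
            for line in block[1:]:
                out_file.write(line + '\n')
            out_file.write('=======' + '\n')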
This is written in such a way that it's easy to also extract, for example, the addresses. Sorting is done with a simple lambda function.
from pprint import pprint

wallet_splitter = "============"
wallet_content_start = "Tokens eth:"
wallet_line_start = "|"

with open("totals.txt") as infile:
    wallets = infile.read().split(wallet_splitter)
    print(wallets)

wallets_above_20 = []
for wallet in wallets:
    total = 0
    separate = []
    contents = False
    for line in wallet.splitlines():
        if wallet_content_start in line:
            contents = True
        elif contents:
            if "$" in line:
                separate.append(line.replace(wallet_line_start, "").split("$")[0])
                total += float(separate[-1])
            else:
                contents = False
    for amount in separate:
        if float(amount) > 20:
            wallets_above_20.append({
                "total": total,
                "data": wallet
            })
pprint(sorted(wallets_above_20, key=lambda i: i['total'], reverse=True))
This is another simple extensible approach you can use to achieve what you need. The comments will explain the code.
# Create a simple representational object with data for every record.
class RateObject:
    # You can change the delimiter to whatever you want.
    def __init__(self, text_lines: list, delimiter="Tokens eth:"):
        self.text_lines = text_lines
        index = [i for i, x in enumerate(text_lines) if delimiter in x][0]
        # Get the value from the delimiter line
        self.value = self._get_value(index)

    # Override this method to change the way you extract the value,
    # e.g. from the same line or a different line.
    def _get_value(self, delimiter_index: int):
        # Case of Tokens eth:
        value = self.text_lines[delimiter_index + 1]
        value = value.strip()
        # A naive way of parsing the number; can be improved
        number = "".join([x for x in value if x.isdigit() or x == "."])
        if number:
            return float(number)
        else:
            # Assume 0 for unknown values
            return 0.0

    def __str__(self):
        # Return the lines as they are
        return "".join(self.text_lines)

    def __repr__(self):
        return "".join(self.text_lines)

# Read the source file
with open("src.txt", "r") as src:
    line_texts = src.readlines()

# Split the lines into sections, using the delimiter ============
splitters = [index for index, text in enumerate(line_texts) if text == "============\n"]

# Create a list of RateObjects
raw_objects = [RateObject(lt) for lt in
               [line_texts[splitters[i]:splitters[i + 1]] for i in range(len(splitters) - 1)]]

# Filter the objects, keeping only the ones with value > 20
selected_objects = list(filter(lambda x: x.value > 20.0, raw_objects))

# Sort the objects by value
sorted_objects = sorted(selected_objects, key=lambda x: x.value, reverse=True)

# print(selected_objects)
# print(sorted_objects)

# Write the sorted objects to a file
with open("sorted.txt", "w") as dst:
    dst.write("\n".join([str(x) for x in sorted_objects]))
Here's a simple generator-based approach.
def items(file):
    """
    Generator to yield items from filename
    whose "Tokens eth:" is above 20.0
    """
    with open(file) as lines:
        item = []
        tokens = 0
        capture = False
        for line in lines:
            if line == "============\n":
                if tokens > 20.0:
                    yield tokens, item
                item = []
                tokens = 0
                continue
            if capture:
                tokens = float(line.strip().split()[-2].rstrip("$"))
                capture = False
            if line.startswith("| Tokens eth:"):
                # Set flag to capture next line when we get to it
                capture = True
            item.append(line)

def main():
    import sys
    print("============")
    for tokens, item in sorted(list(items(sys.argv[1]))):
        print("".join(item), end="")
        print("============")

if __name__ == "__main__":
    main()
For simplicity, I made the generator also perform filtering, though it would be easy to remove items with a lower total on the caller's side if you wanted to make this reusable.
Demo: https://ideone.com/UKuC6C
In fact, I would recommend that you parse this haphazard file format just once, and convert it to a standard format like CSV or JSON for further processing if this is more than a one-off.
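If you do go the convert-once route, the items generator above already yields (tokens, lines) pairs, so dumping them to JSON is only a few lines. A small sketch with a hypothetical output file name wallets.json (note the generator already filters on the 20.0 threshold):
import json
import sys

def to_json(src, dst):
    # Store each block as one JSON object with its parsed token amount.
    records = [{"tokens_eth": tokens, "block": "".join(lines)}
               for tokens, lines in items(src)]
    with open(dst, "w") as fh:
        json.dump(records, fh, indent=2)

to_json(sys.argv[1], "wallets.json")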
Using regular expressions from the re module of the standard library you can, for example, split the text into blocks enclosed by the separator, then find the amount of eth in each block, sort and finally filter them.
import re

# parameters
total_txt = """from question"""
sorted_file_name = 'sort.txt'
THRESHOLD = 20.
as_decreasing_order = False

# body
separators = re.finditer('=' * 12, total_txt)
separators = list(separators)
blocks = map(total_txt.__getitem__,
             [slice(m1.start(), m2.start()) for m1, m2 in zip(separators, separators[1:])])
amount_block_pairs = [(float(re.search(r'Tokens eth:\n\| (\d*\.\d*)\$', block, re.M).group(1)), block)
                      for block in blocks]
# reverse=False for increasing order, True for the opposite
sorted_blocks = sorted(amount_block_pairs, reverse=as_decreasing_order)
filtered_blocks = [block for amount, block in sorted_blocks if amount >= THRESHOLD]
with open(sorted_file_name, 'w') as fd:
    fd.write(''.join(filtered_blocks))
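The total_txt placeholder above stands in for the text from the question; if the data lives in a file instead, it can simply be read in first (assuming the file name total.txt from the question):
with open('total.txt') as fh:
    total_txt = fh.read()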
Another option is to use a Python ttp template to parse your data. The following code checks your total values and finds the ones lower than 20.0; it then asks for a value to enter, which replaces the Tokens eth: amount belonging to each such entry.
from ttp import ttp
import json

with open('total.txt') as f:
    data_to_parse = f.read()

ttp_template = '''
| Total: {{total}}$
| {{tokens_eth}}$ {{ignore}}
'''

parser = ttp(data=data_to_parse, template=ttp_template)
parser.parse()

# print result in JSON format
results = parser.result(format='json')[0]
# print(results)

# converting str to json
result = json.loads(results)
# print(result)

for i in result[0]:
    # print(i)
    if float(i['total']) < 20:
        new_tokens_eth = float(input(f"Total value is {i['total']} lower than 20. Enter a new 'Tokens eth:' value: "))
        if i['tokens_eth'] in data_to_parse:
            data_to_parse = data_to_parse.replace(i['tokens_eth'], str(new_tokens_eth))

print(data_to_parse)

obfuscation of a text file using python - by reversing the words and inserting a specific number of random characters between them

Beginner coding problem: I am supposed to write code that reverses the contents of a file and then inserts a number of random characters based on a strength the user chooses. It then creates a new file containing the obfuscated text.
For example, if the user chooses strength = 2, it will insert 2 random characters between each letter in the text file: The cat sits ---> sgyt6gilns t7faxdc e3dh1kT
Right now my program inserts too many characters in between and I can't figure out why.
This is what it's doing:
input: CAT
Output of strength = 1: TeAEADQoC
import string
import random

def getRandomChar():
    alpha = string.ascii_letters + string.digits
    return random.choice(alpha)

def randomString(EncrypStrength):
    count = 0
    result = ''
    while count < len(EncrypStrength):
        result += getRandomChar()
        count += 1
    return result

def ReverseString(OrigFile):
    return OrigFile[::-1]

def LineEncrypt(line, EncrypStrength):
    EncrypStrength = ReverseString(line)
    index = 0
    newline = EncrypStrength[index]
    index += 1
    while index < len(EncrypStrength):
        newline += randomString(EncrypStrength)
        newline += EncrypStrength[index]
        index += 1
    return newline

def main():
    OrigFile = input('Original File Name:')
    EncryptedFile = input("obfuscated File Name:")
    EncrypStrength = int(input('Enter the Encryption Strength:'))
    Orig = open(OrigFile, 'r')
    Encrypted = open(EncryptedFile, 'w')
    line = Orig.readline()
    while line != '':
        encryptLine = LineEncrypt(line, EncrypStrength)
        Encrypted.write(encryptLine + "\n")
        line = Orig.readline()
    Orig.close()
    Encrypted.close()

if __name__ == "__main__":
    main()
In the LineEncrypt method you are using EncrypStrength incorrectly: you are overriding the number of characters to insert (EncrypStrength) with the reversed line.
def LineEncrypt(line, EncrypStrength):
    reversedString = ReverseString(line)
    index = 0
    newline = reversedString[index]
    index += 1
    while index < len(reversedString):
        newline += randomString(EncrypStrength)
        newline += reversedString[index]
        index += 1
    return newline
    # note: randomString must also count up to EncrypStrength itself
    # (see the full corrected code in the next answer)
You are confusing EncrypStrength and overriding it, as Ritesh mentioned.
Here is the full corrected code, I hope it will work as you expected.
import string
import random

def getRandomChar():
    alpha = string.ascii_letters + string.digits
    return random.choice(alpha)

def randomString(EncrypStrength):
    count = 0
    result = ''
    while count < EncrypStrength:
        result += getRandomChar()
        count += 1
    return result

def ReverseString(OrigFile):
    return OrigFile[::-1]

def LineEncrypt(line, EncrypStrength):
    RevStr = ReverseString(line)
    index = 0
    newline = RevStr[index]
    index += 1
    while index < len(RevStr):
        newline += randomString(EncrypStrength)
        newline += RevStr[index]
        index += 1
    return newline

def main():
    OrigFile = input('Original File Name:')
    EncryptedFile = input("obfuscated File Name:")
    EncrypStrength = int(input('Enter the Encryption Strength:'))
    Orig = open(OrigFile, 'r')
    Encrypted = open(EncryptedFile, 'w')
    line = Orig.readline()
    while line != '':
        encryptLine = LineEncrypt(line, EncrypStrength)
        Encrypted.write(encryptLine + "\n")
        line = Orig.readline()
    Orig.close()
    Encrypted.close()

if __name__ == "__main__":
    main()
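As a quick check of the corrected code (not part of the original answer), calling LineEncrypt directly on the question's example should now give the reversed text with exactly one random character between each pair of letters; the padding characters vary from run to run:
# "CAT" reversed is "TAC", so with strength 1 the output has the shape "T?A?C",
# e.g. "TxAyC" for some random characters x and y.
print(LineEncrypt("CAT", 1))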

Python: Rosalind Consensus and Profile

I am trying to solve the "Consensus and Profile" challenge on Rosalind.
The challenge instructions are as follows:
Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.
Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)
My code is as follows (I got most of it from another user on this website). My only issue is that some of the DNA strands are broken across multiple lines, so they are being appended to the "allstrings" list as separate strings. I am trying to figure out how to join each run of consecutive lines that do not contain ">" into a single string.
import numpy as np

seq = []
allstrings = []
temp_seq = []
matrix = []
C = []
G = []
T = []
A = []
P = []
consensus = []
position = 1

file = open("C:/Users/knigh/Documents/rosalind_cons (3).txt", "r")
conout = open("C:/Users/knigh/Documents/consensus.txt", "w")

# Right now, this is reading and writing each as an individual line. Thus, it
# is splitting each sequence into multiple small sequences. You need to figure
# out how to read this in FASTA format to prevent this from occurring
desc = file.readlines()
for line in desc:
    allstrings.append(line)

for string in range(1, len(allstrings)):
    if ">" not in allstrings[string]:
        temp_seq.append(allstrings[string])
    else:
        seq.insert(position, temp_seq[0])
        temp_seq = []
        position += 1
# This last insertion into the sequence must be performed after the loop to empty
# out the last remaining string from temp_seq
seq.insert(position, temp_seq[0])

for base in seq:
    matrix.append([pos for pos in base])
M = np.array(matrix).reshape(len(seq), len(seq[0]))

for base in range(len(seq[0])):
    A_count = 0
    C_count = 0
    G_count = 0
    T_count = 0
    for pos in M[:, base]:
        if pos == "A":
            A_count += 1
        elif pos == "C":
            C_count += 1
        elif pos == "G":
            G_count += 1
        elif pos == "T":
            T_count += 1
    A.append(A_count)
    C.append(C_count)
    G.append(G_count)
    T.append(T_count)

profile_matrix = {"A": A, "C": C, "G": G, "T": T}
P.append(A)
P.append(C)
P.append(G)
P.append(T)

profile = np.array(P).reshape(4, len(A))
for pos in range(len(A)):
    if max(profile[:, pos]) == profile[0, pos]:
        consensus.append("A")
    elif max(profile[:, pos]) == profile[1, pos]:
        consensus.append("C")
    elif max(profile[:, pos]) == profile[2, pos]:
        consensus.append("G")
    elif max(profile[:, pos]) == profile[3, pos]:
        consensus.append("T")

conout.write("".join(consensus) + "\n")
for k, v in profile_matrix.items():
    conout.write(k + ": " + " ".join(str(x) for x in v) + "\n")
conout.close()
There are a couple of ways that you can iterate a FASTA file as records. You can use a prebuilt library or write your own.
A widely used library for working with sequence data is biopython. This code snippet will create a list of strings.
from Bio import SeqIO

file = "path/to/your/file.fa"
sequences = []
with open(file, "r") as file_handle:
    for record in SeqIO.parse(file_handle, "fasta"):
        sequences.append(str(record.seq))  # str() so the list holds plain strings, not Seq objects
Alternatively, you can write your own FASTA parser. Something like this should work:
def read_fasta(fh):
    # Iterate to get the first FASTA header
    for line in fh:
        if line.startswith(">"):
            name = line[1:].strip()
            break
    # This list will hold the sequence lines
    fa_lines = []
    # Now iterate to collect the (possibly multiline) sequences
    for line in fh:
        if line.startswith(">"):
            # When in this block we have reached the next FASTA record:
            # yield the previous record's name and sequence as a tuple
            # that we can unpack
            yield name, "".join(fa_lines)
            # Reset the sequence lines and save the name of the next record
            fa_lines = []
            name = line[1:].strip()
            # skip to next line
            continue
        fa_lines.append(line.strip())
    yield name, "".join(fa_lines)
You can use this function like so:
file = "path/to/your/file.fa"
sequences = []
with open(file, "r") as file_handle:
for name, seq in read_fasta(file_handle):
sequences.append(seq)
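As a quick self-contained check of read_fasta (not something the original answer included), you can feed it an in-memory handle instead of a real file:
import io

fake_fasta = io.StringIO(">seq1\nACGT\nACGA\n>seq2\nTTTT\nGGGG\n")
for name, seq in read_fasta(fake_fasta):
    print(name, seq)
# prints:
# seq1 ACGTACGA
# seq2 TTTTGGGG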

How can I fix this error: "NameError: global name 'filtered_text' is not defined" in the code below?

import re
from sys import argv

def read_file(fname):
    """ open and extract the text from the file """
    txt_file = open(fname, 'r')
    txt = txt_file.read()
    txt_file.close()
    return txt

def clean_space(files):
    """ remove spaces from the file """
    return files.replace('\n', '')

def filter_file(files):
    """ remove punctuation and filter small words from the file """
    split_words = map(lambda x: re.sub('[^A-Za-z0-9]+', '', x),
                      files.split())
    filtered_txt = [x for x in split_words if len(x) > 1]
    return filtered_txt

def dict_count(files):
    """ for loop to return dict with word count and length keys """
    lengths = {}
    for word in filtered_text:  # And this also
        length = len(word)
        if length not in lengths:
            lengths[length] = 0
        lengths[length] += 1
    for length, counter in lengths.item():
        return "Words of length %d: %d" % (length, counter)

def print_result(fname):
    fi = dict_count(filter_file(clean_space(read_file(fname))))
    print fi

if __name__ == '__main__':
    script, fname = argv
    print_result(fname)
In the function dict_count you never created the filtered_text variable, yet you try to use it.
You must create the variable before using it:
filtered_text = filter_file(files)
complete code:
def dict_count(files):
    """ for loop to return dict with word count and length keys """
    lengths = {}
    filtered_text = filter_file(files)
    for word in filtered_text:
        length = len(word)
        if length not in lengths:
            lengths[length] = 0
        lengths[length] += 1
    for length, counter in lengths.items():  # items(), not item()
        return "Words of length %d: %d" % (length, counter)

Printing values following comparison of two csv files only if in a specific range using Python 3.3

I'm new at programming and I've got two CSV files that I'm trying to compare. The first file, snp.csv is shown below:
chrom position ref var gene var
1 21421 G T WASH7P snp.LOH
1 1251593 T C CPSF3L snp.somatic
6 107474777 - A PDSS2 indel.somatic
14 106586168 G T ADAM6 snp.LOH
The second file, quad.csv is shown below:
chrom Start End Sequence
1 21420 21437 GGGACGGGGAGGGTTGGG
1 23058 23078 GGGCTGGGGCGGGGGGAGGG
1 23515 23534 GGGAAGGGACAGGGCAGGG
1 45098 45118 GGGAAAGGGCAGGGCCCGGG
3 1148 1173 GGGCCGGGCAAGGCCGGGTGCAGGG
I want to compare these two files: when the chrom values match, I want to print only the snp.csv rows whose position falls within the Start and End values of the quad.csv row.
So I am looking for a solution that gives me something like the following (basically the snp.csv row with the Start, End and Sequence values from the matching quad.csv row):
chrom position ref var gene var Start End Sequence
1 21421 G T WASH7P snp.LOH 21420 21437 GGGACGGGGAGGGTTGGG
I've searched the posts and found some interesting answers that helped me a lot but I’m still experiencing some issues. I’m still learning Python…
Here is my script so far. I know I have a problem with the range function... I'm stuck.
import csv

snp_file = open("snp.csv", "r")
quad_file = open("quad.csv", "r")
out_file = open("results.csv", "wb")

snp = csv.reader(snp_file, delimiter='\t')
quad = csv.reader(quad_file, delimiter='\t')
out = csv.reader(out_file, delimiter='\t')

quadlist = [row for row in quad]

for snp_row in snp:
    row = 1
    found = False
    for quad_row in quadlist:
        results_row = snp_row
        if snp_row[0] == quad_row[0]:
            quad_pos = range(quad_row[1], quad_row[2])
            if snp_row[1] in quad_pos:
                results_row.append(quad_row)
                found = True
                break
        row = row + 1
    if not found:
        pass
    print(results_row)

snp.close()
quad.close()
out.close()
from bisect import bisect_right
from collections import defaultdict
import csv

TOO_HIGH = 2147483647  # higher than any actual gene position

SNP_FMT = "{0:<7} {1:<11} {2:3} {3:3} {4:11} {5:15}".format
QUAD_FMT = " {1:<7} {2:<7} {3}".format

def line_to_quad(line):
    row = line.split()
    return int(row[0]), int(row[1]), int(row[2]), row[3]

def line_to_snp(line):
    row = line.split()
    return int(row[0]), int(row[1]), row[2], row[3], row[4], row[5]

class Quads:
    @classmethod
    def from_file(cls, fname):
        with open(fname, "rU") as inf:
            next(inf, None)  # skip header line
            quads = (line_to_quad(line) for line in inf)
            return cls(quads)

    def __init__(self, rows):
        self.chromosomes = defaultdict(list)
        for row in rows:
            self.chromosomes[row[0]].append(row[1:])
        for segs in self.chromosomes.values():
            segs.sort()

    def find_match(self, chromosome, position):
        segs = self.chromosomes[chromosome]
        index = bisect_right(segs, (position, TOO_HIGH, "")) - 1
        try:
            seg = segs[index]
            if seg[0] <= position <= seg[1]:
                return (chromosome,) + seg
        except IndexError:
            pass

def main():
    quads = Quads.from_file("quad.csv")
    print(  # header
        SNP_FMT("chrom", "position", "ref", "var", "gene", "var") +
        QUAD_FMT("chrom", "Start", "End", "Sequence")
    )
    with open("snp.csv") as inf:
        next(inf, None)  # skip header line
        for line in inf:
            snp = line_to_snp(line)
            quad = quads.find_match(snp[0], snp[1])
            if quad:
                print(SNP_FMT(*snp) + QUAD_FMT(*quad))

if __name__ == "__main__":
    main()
which gives
chrom position ref var gene var Start End Sequence
1 21421 G T WASH7P snp.LOH 21420 21437 GGGACGGGGAGGGTTGGG
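If you also want the matches written to results.csv, as the original script attempted with out_file, the same find_match loop can feed a csv.writer instead of print. A minimal sketch, assuming tab-delimited output like the input files:
import csv

def write_results(out_name="results.csv"):
    quads = Quads.from_file("quad.csv")
    with open("snp.csv") as inf, open(out_name, "w", newline="") as outf:
        writer = csv.writer(outf, delimiter="\t")
        writer.writerow(["chrom", "position", "ref", "var", "gene", "var",
                         "Start", "End", "Sequence"])
        next(inf, None)  # skip header line
        for line in inf:
            snp = line_to_snp(line)
            quad = quads.find_match(snp[0], snp[1])
            if quad:
                writer.writerow(list(snp) + list(quad[1:]))  # quad[0] repeats chrom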

Categories

Resources