Avoiding repetition of if statement - python

I prepared functions and sorted the data for this task
(it's actually AoC day 4, but here's a quick explanation to make it clear).
I have already sorted the data into this 'structure':
byr:1991
eyr:2022
hcl:#341e13
iyr:2016
pid:729933757
hgt:167cm
ecl:gry

hcl:231d64
cid:124
ecl:gmt
eyr:2039
hgt:189in
pid:#9c3ea1

ecl:#1f58f9
pid:#758e59
iyr:2022
hcl:z
byr:2016
hgt:68
eyr:1933
[and so on, 250+ more packages (by package I mean a set of byr, ecl, eyr, ... lines; packages are separated by a blank line).]
and prepared this code:
def check_fields(list):
    comparison_list = ['byr', 'iyr', 'eyr',
                       'hgt', 'hcl', 'ecl',
                       'pid']
    statement = True
    for i in comparison_list:
        statement = statement and (i in list)
    return statement

def check_byr_iyr_eyr(line):
    prefix, value = line.split(':')
    cases = {'byr': {'min': 1920, 'max': 2002},
             'iyr': {'min': 2010, 'max': 2020},
             'eyr': {'min': 2020, 'max': 2030}}
    return cases[prefix]['min'] <= int(value) <= cases[prefix]['max']

def check_hgt(line):
    unit = line[len(line)-2] + line[len(line)-1]
    value = line[line.index(':')+1: -2]
    cases = {'cm': {'min': 150, 'max': 193},
             'in': {'min': 59, 'max': 76}}
    return cases[unit]['min'] <= int(value) <= cases[unit]['max']

def check_hcl(line):
    statement = True
    if line[line.index(':')+1] != '#' or len(line[line.index(':')+2:]) != 6:
        return False
    else:
        string = line[line.index('#')+1:]
        for i in string:
            statement = statement and (97 <= ord(i) <= 102 or 48 <= ord(i) <= 57)
    return statement

def check_ecl(line):
    comparison_list = ['amb', 'blu', 'brn',
                       'gry', 'grn', 'hzl',
                       'oth']
    if line[line.index(':') + 1:] in comparison_list:
        return True
    return False

def check_pid(line):
    if len(line[line.index(':')+1:]) != 9:
        return False
    try:
        int(line[line.index(':')+1:])
        return True
    except:
        return False

line_list = []
valid_passports = 0
with open('results.txt', 'r') as f:
    for line in f:
        if line != '\n':
            ''' add line to line_list '''
            pass
        else:
            '''
            check lines from line_list
            using above declared functions
            if every line is ok:
                valid_passports += 1
            '''
I have to check that every package contains every key except cid, and then check that every value for each key is proper.
byr (Birth Year) - four digits; at least 1920 and at most 2002.
iyr (Issue Year) - four digits; at least 2010 and at most 2020.
eyr (Expiration Year) - four digits; at least 2020 and at most 2030.
hgt (Height) - a number followed by either cm or in:
If cm, the number must be at least 150 and at most 193.
If in, the number must be at least 59 and at most 76.
hcl (Hair Color) - a # followed by exactly six characters 0-9 or a-f.
ecl (Eye Color) - exactly one of: amb blu brn gry grn hzl oth.
pid (Passport ID) - a nine-digit number, including leading zeroes.
cid (Country ID) - ignored, missing or not.
(The above rules are enforced by the earlier declared functions.)
And the question/problem is: how can I avoid a pile of repetitive if statements when checking every line added to line_list (this refers to the part with the multi-line "pseudo code" comment)? I mean, I could do it like

if line[0:3] == "byr":
    check_byr(line)
# and so on, many if statements checking the first 3 letters to pick the proper function to use

but it doesn't seem like a proper and elegant solution; maybe you could give me hints on how to deal with that, or another idea for solving the problem in a different way that I didn't use.
Please help, thanks.

Can't you have a mapping from prefix to target function?
Something like
line = # ...
prefix = # ... either "hgt" or "pid" or other

def check_hgt(line):
    pass

def check_pid(line):
    pass

# ... other checker functions

checker_functions_pool = {"hgt": check_hgt, "pid": check_pid}
checker_function = checker_functions_pool[prefix]
checker_function(line)
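For instance, applied to one package of lines, with the check_* functions from your question in scope (a minimal sketch; cid is simply skipped since it needs no check):

def is_valid_package(lines):
    required = {'byr', 'iyr', 'eyr', 'hgt', 'hcl', 'ecl', 'pid'}
    pool = {'byr': check_byr_iyr_eyr, 'iyr': check_byr_iyr_eyr,
            'eyr': check_byr_iyr_eyr, 'hgt': check_hgt,
            'hcl': check_hcl, 'ecl': check_ecl, 'pid': check_pid}
    # every required field must be present ...
    if not required <= {line[:3] for line in lines}:
        return False
    # ... and every known field must pass its checker
    return all(pool[line[:3]](line) for line in lines if line[:3] in pool)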

@viGor207, this is another way to approach it (part 2 example):
import re

passports = [
    dict(
        line.split(':')
        for line in pas.split()
    )
    for pas in open('input').read().split('\n\n')
]

required = {'byr', 'iyr', 'eyr', 'hgt', 'hcl', 'ecl', 'pid'}

def valid(pas):
    return bool(
        1920 <= int(pas['byr']) <= 2002 and
        2010 <= int(pas['iyr']) <= 2020 <= int(pas['eyr']) <= 2030 and
        re.fullmatch(r'[0-9]{2,3}(cm|in)', pas['hgt']) and
        (
            (pas['hgt'][-2:] == 'cm' and 150 <= int(pas['hgt'][:-2]) <= 193) or
            (pas['hgt'][-2:] == 'in' and 59 <= int(pas['hgt'][:-2]) <= 76)
        ) and
        re.fullmatch(r'#[0-9a-f]{6}', pas['hcl']) and
        pas['ecl'] in {'amb', 'blu', 'brn', 'gry', 'grn', 'hzl', 'oth'} and
        re.fullmatch(r'[0-9]{9}', pas['pid'])
    )

print(
    sum(
        all(r in pas for r in required) and valid(pas)
        for pas in passports
    )
)

To make it complete, here is the part one:
passports = [
    dict(
        line.split(':')
        for line in pas.split()
    )
    for pas in open('input').read().split('\n\n')
]

required = {'byr', 'iyr', 'eyr', 'hgt', 'hcl', 'ecl', 'pid'}

print(
    sum(
        all(r in pas for r in required)
        for pas in passports
    )
)

The task states:
Each passport is represented as a sequence of key:value pairs separated by spaces or newlines. Passports are separated by blank lines.

"a sequence of key:value pairs" is screaming "use a list of dicts" in Python, but your method could be used too.
You could use a dict that maps each field name to the function that checks that field's line, field_to_checker. I took your example input as a list of parsed lines, added a checker for cid that just returns True, and created the following code snippet:
def check_cid(line):
    return True

field_to_checker = {
    'byr': check_byr_iyr_eyr,
    'cid': check_cid,
    'ecl': check_ecl,
    'eyr': check_byr_iyr_eyr,
    'hcl': check_hcl,
    'hgt': check_hgt,
    'iyr': check_byr_iyr_eyr,
    'pid': check_pid,
}

line_data = """byr:1991
eyr:2022
hcl:#341e13
iyr:2016
pid:729933757
hgt:167cm
ecl:gry

hcl:231d64
cid:124
ecl:gmt
eyr:2039
hgt:189in
pid:#9c3ea1

ecl:#1f58f9
pid:#758e59
iyr:2022
hcl:z
byr:2016
hgt:68
eyr:1933""".split('\n')

valid_passports = 0
ok_passport = True  # Accumulating over all fields of one passport
for line in line_data + ['\n']:  # Add empty line to force processing last passport
    line = line.rstrip()
    if line:  # Not a blank line
        if ok_passport:  # One False value in a passport will prevail
            key = line[:3]
            ok_passport = (key in field_to_checker
                           and field_to_checker[key](line))
    else:  # Blank line, end of passport record
        if ok_passport:
            valid_passports += 1
        ok_passport = True
In the for line in line_data + ['\n'] loop, the valid_passports count is only updated when a blank line ends a passport record. The last passport needs a blank line after it to be counted properly, hence the extra blank line appended to line_data.
The above is untested, but it should give you tips on how to extend what you have started with.
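If you also want to get rid of the manual blank-line bookkeeping, itertools.groupby can split the lines into packages in one pass. A sketch, also untested against the real input; the try/except is there because the question's checkers can raise on malformed values such as hgt:68:

from itertools import groupby

def packages(lines):
    # runs of non-blank lines form packages; blank lines separate them
    for has_data, group in groupby(lines, key=lambda l: bool(l.strip())):
        if has_data:
            yield list(group)

def passes(line):
    key = line[:3]
    try:
        return key in field_to_checker and field_to_checker[key](line)
    except (KeyError, ValueError):  # malformed value, e.g. hgt:68
        return False

valid_passports = sum(all(passes(line) for line in pkg)
                      for pkg in packages(line_data))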

I would suggest placing the dictionary values in variables as early as possible to make the logic simpler to write and read.
That should allow you to make single-line conditions that are legible:
data = \
"""byr:1991
eyr:2022
hcl:#341e13
iyr:2016
pid:729933757
hgt:167cm
ecl:gry

hcl:231d64
cid:124
ecl:gmt
eyr:2039
hgt:189in
pid:#9c3ea1

ecl:#1f58f9
pid:#758e59
iyr:2022
hcl:z
byr:2016
hgt:68
eyr:1933"""
...

# iterator to get packages
def getPackages(d):
    package = dict()
    for line in d:
        if line:
            field, value = line.split(":", 1)
            package[field] = value
        else:
            yield package.copy()
            package.clear()
    if package: yield package

fields = ['byr', 'iyr', 'eyr', 'hgt', 'hcl', 'ecl', 'pid']
for package in getPackages(data.split("\n")):
    values = [package.get(f, "") for f in fields]
    byr, iyr, eyr, hgt, hcl, ecl, pid = values
    isValid = "" not in values[:-1] \
              and int(byr) in range(1920, 2002+1) \
              and int(iyr) in range(2010, 2020+1) \
              and int(eyr) in range(2020, 2030+1) \
              and int(hgt[:-2]) in {"cm": range(150, 193+1), "in": range(59, 76+1)}.get(hgt[-2:], []) \
              and hcl.startswith("#") and len(hcl) == 7 \
              and all(97 <= ord(i) <= 102 or 48 <= ord(i) <= 57 for i in hcl[1:]) \
              and ecl in {'amb', 'blu', 'brn', 'gry', 'grn', 'hzl', 'oth'} \
              and (pid == "" or pid.isdigit() and len(pid) == 9)
    print(pid, isValid)

"""
729933757 True
#9c3ea1 False
#758e59 False
"""


finding the same words in two files and leaving out not repeated ones in python

I have to write a program that correlates smoking with lung cancer risk. For that I have data in two files.
My code currently pairs up whatever values happen to be on the same line (e.g. America,23.3 with Spain,77.9 and Italy,24.2 with Russia,60.8).
How can I modify my code so that it matches the numbers of the same countries and leaves out the countries that occur in only one file (it shouldn't compute Germany, France, China, Korea because they are only in one file)?
Thank you so much for your help in advance :)
smoking file:
Country, Percent Cigarette Smokers Data
America,23.3
Italy,24.2
Russia,23.7
France,14.9
England,17.9
Spain,17
Germany,21.7
second file:
Cases Lung Cancer per 100000
Spain,77.9
Russia,60.8
Korea,61.3
America,73.3
China,66.8
Vietnam,64.5
Italy,43.9
and my code:
import math  # needed by calculateCorrelation below

def readFiles(smoking_datafile, cancer_datafile):
    '''
    Reads the data from the provided file objects smoking_datafile
    and cancer_datafile. Returns a list of the data read from each
    in a tuple of the form (smoking_datafile, cancer_datafile).
    '''
    # init
    smoking_data = []
    cancer_data = []
    empty_str = ''
    # read past file headers
    smoking_datafile.readline()
    cancer_datafile.readline()
    # read data files
    eof = False
    while not eof:
        # read line of data from each file
        s_line = smoking_datafile.readline()
        c_line = cancer_datafile.readline()
        # check if at end-of-file of both files
        if s_line == empty_str and c_line == empty_str:
            eof = True
        # check if end of smoking data file only
        elif s_line == empty_str:
            raise OSError('Unexpected end-of-file for smoking data file')
        # check if at end of cancer data file only
        elif c_line == empty_str:
            raise OSError('Unexpected end-of-file for cancer data file')
        # append line of data to each list
        else:
            smoking_data.append(s_line.strip().split(','))
            cancer_data.append(c_line.strip().split(','))
    # return list of data from each file
    return (smoking_data, cancer_data)

def calculateCorrelation(smoking_data, cancer_data):
    '''
    Calculates and returns the correlation value for the data
    provided in lists smoking_data and cancer_data
    '''
    # init
    sum_smoking_vals = sum_cancer_vals = 0
    sum_smoking_sqrd = sum_cancer_sqrd = 0
    sum_products = 0
    # calculate intermediate correlation values
    num_values = len(smoking_data)
    for k in range(0, num_values):
        sum_smoking_vals += float(smoking_data[k][1])
        sum_cancer_vals += float(cancer_data[k][1])
        sum_smoking_sqrd += float(smoking_data[k][1]) ** 2
        sum_cancer_sqrd += float(cancer_data[k][1]) ** 2
        sum_products += float(smoking_data[k][1]) * float(cancer_data[k][1])
    # calculate and display correlation value
    numer = (num_values * sum_products) - \
            (sum_smoking_vals * sum_cancer_vals)
    denom = math.sqrt(abs(
        ((num_values * sum_smoking_sqrd) - (sum_smoking_vals ** 2)) *
        ((num_values * sum_cancer_sqrd) - (sum_cancer_vals ** 2))
    ))
    return numer / denom
Let's just focus on getting the data into a format that is easy to work with. The code below will get you a dictionary of the form ...

smokers_cancer_data = {
    'America': {
        'smokers': 23.3,
        'cancer': 73.3
    },
    'Italy': {
        'smokers': 24.2,
        'cancer': 43.9
    },
    ...
}
Once you have this you can get any values you need and perform your calculations. See the code below.
def read_data(filename: str) -> dict:
    with open(filename, 'r') as file:
        next(file)  # Skip the header
        data = dict()
        for line in file:
            cleaned_line = line.rstrip()
            # Skip blank lines
            if cleaned_line:
                data_item = cleaned_line.split(',')
                data[data_item[0]] = float(data_item[1])
    return data

# Load data into python dictionaries
smokers_data = read_data('smokersData.txt')
cancer_data = read_data('lungCancerData.txt')

# Build one dictionary that is easy to work with
smokers_cancer_data = dict()
for (key, value) in smokers_data.items():
    if key in cancer_data:
        smokers_cancer_data[key] = {
            'smokers': smokers_data[key],
            'cancer': cancer_data[key]
        }
print(smokers_cancer_data)
For example, if you want to calculate the sum of the smoker and cancer values:

smokers_total = 0
cancer_total = 0
for (key, value) in smokers_cancer_data.items():
    smokers_total += value['smokers']
    cancer_total += value['cancer']
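And since the goal is a correlation, the asker's calculateCorrelation boils down to a few sums over the paired values once the merged dictionary exists (a sketch of the same Pearson formula, using the smokers_cancer_data built above):

import math

def correlation(data):
    xs = [v['smokers'] for v in data.values()]
    ys = [v['cancer'] for v in data.values()]
    n = len(xs)
    sum_x, sum_y = sum(xs), sum(ys)
    sum_x2 = sum(x * x for x in xs)
    sum_y2 = sum(y * y for y in ys)
    sum_xy = sum(x * y for x, y in zip(xs, ys))
    numer = n * sum_xy - sum_x * sum_y
    denom = math.sqrt((n * sum_x2 - sum_x ** 2) * (n * sum_y2 - sum_y ** 2))
    return numer / denom

print(correlation(smokers_cancer_data))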
This will return a list of all the countries that have data in both files, along with the data:

l3 = []
with open('smoking.txt', 'r') as f1, open('cancer.txt', 'r') as f2:
    l1, l2 = f1.readlines(), f2.readlines()
for s1 in l1:
    for s2 in l2:
        if s1.split(',')[0] == s2.split(',')[0]:
            cty = s1.split(',')[0]
            smk = s1.split(',')[1].strip()
            cnr = s2.split(',')[1].strip()
            l3.append(f"{cty}: smoking: {smk}, cancer: {cnr}")
print(l3)

Output:

['America: smoking: 23.3, cancer: 73.3', 'Italy: smoking: 24.2, cancer: 43.9', 'Russia: smoking: 23.7, cancer: 60.8', 'Spain: smoking: 17, cancer: 77.9']

Tiny Language compiler using python and regex

Hello Stack Overflow users,
I hope you're having a good day.
I'm doing this tiny language compiler for my homework and tried using regex, but the output is weird.
First of all, I get an identifier called 't' which is not used in my input, and it doesn't separate the identifier 'x' from the semicolon.
Thanks in advance for your help.
Here is my input:
Here is my input
read x; {input an integer }
if 0 < x then { don’t compute if x <= 0 }
fact := 1;
repeat
fact := fact * x;
x := x - 1
until x = 0;
write fact { output factorial of x }
end
And that's my code using regex
# -*- coding: utf-8 -*-
"""
Created on Wed May 13 04:11:06 2020
@author: PC
"""
import re

class OwnCompiler(object):
    def __init__(self, file):
        self.file = open(file, "r").readlines()
        self.symbols = {
            "+": "PLUS_OP",
            "-": "MINUS_OP",
            "*": "MUL_OP",
            "/": "DIV_OP",
            "=": "EQUAL_OP",
            "<": "LESS_OP",
            ">": "GREATER_OP",
            "(": "LEFT_PARENTHESIS",
            ")": "RIGHT_PARENTHESIS",
            ":=": "ASSIGN",
            ";": "SEMICOLON",
        }
        self.commentPattern = re.compile(r".*({\n*\s*.*\s*})")
        self.reservePattern = re.compile(r"\s*(read|write|if|then|else|end|repeat|until)+\s*(.*)(then)*")
        self.symbolPattern = re.compile(r".*(\+|\*|-|/|=|<|>|\(|\)|;)")
        self.identifierSymbol = re.compile(r".*(\w+)\s+(:=)\s+(.*)")

    def compileOutput(self):
        self.fileWrite = open("output.txt", "w")
        self.fileWrite.write("Type Token\n==================\n")
        for i in self.file:
            print(i)
            self.getComment(i)
            self.getReserveWord(i)
            self.getIdentify(i)
        self.fileWrite.close()  # end

    def getComment(self, text):
        try:
            self.fileWrite.write("COMMENT " + self.commentPattern.match(text).group(1) + "\n")
        except:
            print("NO_COMMENT")

    def getReserveWord(self, text):
        self.Compiled = self.reservePattern.match(text)
        try:
            self.fileWrite.write("RESERVE_WORD " + self.Compiled.group(1) + "\n")
            self.getSymbols(self.Compiled.group(2))
            try:
                self.fileWrite.write("RESERVE_WORD " + self.Compiled.group(3) + "\n")
            except:
                print("NO_RESERVE_WORD2")
        except:
            print("NO_RESERVE_WORD")

    def getSymbols(self, text):
        self.Compiled = self.symbolPattern.match(text)
        self.GOT_TOKEN = self.getTokensSymbols(self.Compiled.group())
        try:
            self.fileWrite.write(self.GOT_TOKEN + " " + self.Compiled.group() + "\n")
        except:
            print("NO_SYMBOLS")

    def getIdentify(self, text):
        self.Compiled = self.identifierSymbol.match(text)
        try:
            self.fileWrite.write("IDENTIFIER " + self.Compiled.group(1) + "\n")
            self.getSymbols(text)
            for i in self.Compiled.group(3):
                if i == " ":
                    continue
                if self.isNumber(i):
                    self.fileWrite.write("NUMBER ")
                else:
                    self.fileWrite.write("WORD ")
            self.fileWrite.write(self.Compiled.group(3) + "\n")
        except:
            print("NO_IDENTIFIRES")

    def getTokensSymbols(self, symbol):
        try:
            return self.symbols[symbol]
        except:
            print("NOT_DEFINED_IN_SYMBOL_DICT")
            return "UNKNOWN"

    def isNumber(self, text):
        try:
            int(text)
            return True
        except:
            return False

if __name__ == "__main__":
    instance = OwnCompiler("input.txt")
    instance.compileOutput()
And here is my output
Type Token
==================
COMMENT { Sample program in TINY language – computes factorial }
COMMENT {input an integer }
RESERVE_WORD read
UNKNOWN x;
COMMENT { don’t compute if x <= 0 }
RESERVE_WORD if
UNKNOWN 0 < x then { don’t compute if x <=
IDENTIFIER t
UNKNOWN fact := 1;
RESERVE_WORD repeat
IDENTIFIER t
UNKNOWN fact := fact * x;
IDENTIFIER x
UNKNOWN x := x -
RESERVE_WORD until
UNKNOWN x = 0;
COMMENT { output factorial of x }
RESERVE_WORD write
RESERVE_WORD end
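The stray IDENTIFIER t is a greedy-matching artifact: in r".*(\w+)\s+(:=)\s+(.*)" the leading .* swallows as much as it can, so for "fact := 1;" it eats "fac" and leaves (\w+) with only the single character "t". You can confirm this in isolation:

import re

m = re.match(r".*(\w+)\s+(:=)\s+(.*)", "fact := 1;")
print(m.group(1))   # prints 't' because the greedy .* consumed 'fac'

The "x;" problem is similar: getSymbols writes self.Compiled.group(), the entire matched text, rather than the captured group(1).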
If you are going to parse a language you need a 'lexer' that will return individual tokens ignoring whitespace and comments. Along these lines, just as an example:
import re, collections

class Lexer(object):
    WHITESPACE = r'(?P<WHITESPACE>\s+)'
    COMMENT = r'(?P<COMMENT>{[^}]*})'
    READ = r'(?P<READ>\bread\b)'
    WRITE = r'(?P<WRITE>\bwrite\b)'
    IF = r'(?P<IF>\bif\b)'
    THEN = r'(?P<THEN>\bthen\b)'
    ELSE = r'(?P<ELSE>\belse\b)'
    END = r'(?P<END>\bend\b)'
    REPEAT = r'(?P<REPEAT>\brepeat\b)'
    UNTIL = r'(?P<UNTIL>\buntil\b)'
    OPERATOR = r'(?P<OPERATOR>(?:[+*/=<>-]|:=))'
    LPAREN = r'(?P<LPAREN>\()'
    RPAREN = r'(?P<RPAREN>\))'
    IDENTIFIER = r'(?P<IDENTIFIER>[a-z]+)'
    INTEGER = r'(?P<INTEGER>\d+)'
    SEMICOLON = r'(?P<SEMICOLON>;)'

    regex = re.compile('|'.join([
        WHITESPACE,
        COMMENT,
        READ,
        WRITE,
        IF,
        THEN,
        ELSE,
        END,
        REPEAT,
        UNTIL,
        OPERATOR,
        LPAREN,
        RPAREN,
        IDENTIFIER,
        INTEGER,
        SEMICOLON
    ]))

    def __init__(self, file):
        def generate_tokens(text):
            Token = collections.namedtuple('Token', ['type', 'value'])
            scanner = Lexer.regex.finditer(text)
            last_end = 0
            for m in scanner:
                start = m.start()
                end = m.end()
                if start != last_end:
                    # skipped over text to find the next token: there was
                    # unrecognizable text, i.e. an "error token"
                    yield Token('ERROR', text[last_end:start])
                last_end = end
                token = Token(m.lastgroup, m.group())
                if token.type != 'WHITESPACE' and token.type != 'COMMENT':
                    yield token
            yield Token('EOF', '<end-of-file>')

        with open(file, "r") as f:
            text = f.read()
        self._token_generator = generate_tokens(text)

    def next_token(self):
        # calling this past the "EOF" token raises StopIteration
        return next(self._token_generator)

lexer = Lexer('input.txt')
while True:
    token = lexer.next_token()
    print(token)
    if token.type == 'EOF':
        break
Prints:
Token(type='READ', value='read')
Token(type='IDENTIFIER', value='x')
Token(type='SEMICOLON', value=';')
Token(type='IF', value='if')
Token(type='INTEGER', value='0')
Token(type='OPERATOR', value='<')
Token(type='IDENTIFIER', value='x')
Token(type='THEN', value='then')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='INTEGER', value='1')
Token(type='SEMICOLON', value=';')
Token(type='REPEAT', value='repeat')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value='*')
Token(type='IDENTIFIER', value='x')
Token(type='SEMICOLON', value=';')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='-')
Token(type='INTEGER', value='1')
Token(type='UNTIL', value='until')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='=')
Token(type='INTEGER', value='0')
Token(type='SEMICOLON', value=';')
Token(type='WRITE', value='write')
Token(type='IDENTIFIER', value='fact')
Token(type='END', value='end')
Token(type='EOF', value='<end-of-file>')

Replace string with custom values and iterate for all occurrences

I am trying to replace some text in a file using either SED, PERL, AWK or a python script. I've tried a couple of things but can't seem to work it out.
I have the following in a text file called data.txt
&st=ALPHA&type=rec&uniId=JIM&acceptCode=123&drainNel=supp&
&st=ALPHA&type=rec&uniId=JIM&acceptCode=167&drainNel=supp&
&st=ALPHA&type=rec&uniId=SARA&acceptCode=231&drainNel=ured&
&st=ALPHA&type=rec&uniId=SARA&acceptCode=344&drainNel=iris&
&st=ALPHA&type=rec&uniId=SARA&acceptCode=349&drainNel=iris&
&st=ALPHA&type=rec&uniId=DAVE&acceptCode=201&drainNel=teef&
1) The script will take an input argument in the form of a number, e.g. 10000.
2) I want to replace the text ALPHA with that number, incrementing it by 100 for each following line with the same uniId, and by 5000 whenever a new uniId starts.
3) I want to replace each acceptCode with the first st value of all lines with the same uniId.

./script 10000

... still confused? Well, the final result would be this:
&st=10000&type=rec&uniId=JIM&acceptCode=10000&drainNel=supp&
&st=10100&type=rec&uniId=JIM&acceptCode=10000&drainNel=supp&
&st=15100&type=rec&uniId=SARA&acceptCode=15100&drainNel=ured&
&st=15200&type=rec&uniId=SARA&acceptCode=15100&drainNel=iris&
&st=15300&type=rec&uniId=SARA&acceptCode=15100&drainNel=iris&
&st=20300&type=rec&uniId=DAVE&acceptCode=20300&drainNel=teef&
This ^ should REPLACE the contents of data.txt - applied to the file itself, not just printed on screen.
Okay, here's one way, using awk (wrapped in a shell script for convenience because it's a bit too much for a one-liner):
#!/bin/sh
# Usage:
# $ ./transform.sh [STARTCOUNT] < data.txt > temp.txt
# $ mv -f temp.txt data.txt

awk -F '&' -v "cnt=${1:-10000}" -v 'OFS=&' \
    'NR == 1 { ac = cnt; uni = $4 }
     NR > 1 && $4 == uni { cnt += 100 }
     $4 != uni { cnt += 5000; ac = cnt; uni = $4 }
     { $2 = "st=" cnt; $5 = "acceptCode=" ac; print }'
Running this on a file holding your sample input:
$ ./transform.sh 10000 < data.txt
&st=10000&type=rec&uniId=JIM&acceptCode=10000&drainNel=supp&
&st=10100&type=rec&uniId=JIM&acceptCode=10000&drainNel=supp&
&st=15100&type=rec&uniId=SARA&acceptCode=15100&drainNel=ured&
&st=15200&type=rec&uniId=SARA&acceptCode=15100&drainNel=iris&
&st=15300&type=rec&uniId=SARA&acceptCode=15100&drainNel=iris&
&st=20300&type=rec&uniId=DAVE&acceptCode=20300&drainNel=teef&
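For comparison, here is the same control-break logic as a short Python sketch (like the awk script, it prints the transformed lines to stdout, so redirect to a temp file and move it back yourself):

import sys

cnt = int(sys.argv[1]) if len(sys.argv) > 1 else 10000
ac, uni = cnt, None
with open('data.txt') as f:
    for line in f:
        fields = line.rstrip('\n').split('&')
        if uni is None:            # first record
            uni = fields[3]
        elif fields[3] != uni:     # new uniId: jump 5000 and reset acceptCode
            cnt += 5000
            ac = cnt
            uni = fields[3]
        else:                      # same uniId: step 100
            cnt += 100
        fields[1] = 'st=%d' % cnt
        fields[4] = 'acceptCode=%d' % ac
        print('&'.join(fields))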
And a perl version that does an in-place edit of the input file:
#!/usr/bin/perl -ani -F'&'
# Usage:
# $ ./transform.pl COUNT datafile
use warnings;
use strict;
use English;
our ($count, $a, $uni);
BEGIN {
$count = shift #ARGV;
die "Missing count argument" unless defined $count and $count =~ /^\d+$/;
$ac = $count;
$uni = "";
$OFS = '&';
}
if ($NR == 1) {
$uni = $F[3];
} elsif ($uni ne $F[3]) {
$count += 5000;
$ac = $count;
$uni = $F[3];
} else {
$count += 100;
}
$F[1] = "st=$count";
$F[4] = "acceptCode=$ac";
print #F;
Running it on your sample input:
$ ./transform.pl 10000 data.txt
$ cat data.txt
&st=10000&type=rec&uniId=JIM&acceptCode=10000&drainNel=supp&
&st=10100&type=rec&uniId=JIM&acceptCode=10000&drainNel=supp&
&st=15100&type=rec&uniId=SARA&acceptCode=15100&drainNel=ured&
&st=15200&type=rec&uniId=SARA&acceptCode=15100&drainNel=iris&
&st=15300&type=rec&uniId=SARA&acceptCode=15100&drainNel=iris&
&st=20300&type=rec&uniId=DAVE&acceptCode=20300&drainNel=teef&
A few assumptions:

Your requirement 2) ("I want to replace all the text ALPHA with the given long number as arg and increment by 100 if uniId is the same; if it is different it will increment by 5000"), in conjunction with your example output, requires the input data to be sorted on the uniId field. If the file is not sorted, the 100 and 5000 increments will not yield the desired initial values for each uniId.
The increment scheme assumes that no single uniId has enough records to increment into the next 5000 range reserved for newly identified uniId values.
#!/usr/bin/env python3

from collections import OrderedDict
import csv
import sys

class TrackingVars(object):
    """
    The TrackingVars class manages the business logic for maintaining the
    st field counters and the acctCode values for each uniId
    """
    def __init__(self, long_number):
        self.uniId_table = {}
        self.running_counter = long_number

    def __initial_value__(self):
        """
        The first encounter for a uniId will have st = acctCode
        """
        retval = (self.running_counter, self.running_counter)
        return retval

    def get_uniId(self, id):
        """
        A convenience method for returning uniId tracking values
        """
        curval, original_value = self.uniId_table.get(id, self.__initial_value__())
        return (curval, original_value)

    def track(self, uniId):
        """
        curval = original_value when a new uniId is encountered.
        If the uniId is known, simply increment curval by 100.
        If the uniId is new and there is at least 1 key in the
        tracking table, increment curval by 5000.
        Always update the tracking variables.
        """
        curval, original_value = self.get_uniId(uniId)
        if uniId in self.uniId_table.keys():
            curval = curval + 100
        else:
            if self.uniId_table:
                curval = curval + 5000
                original_value = curval
        self.running_counter = curval
        retval = (curval, original_value)
        self.uniId_table[uniId] = retval
        return retval

def data_lines(filename):
    """
    Read file as input delimited by &
    """
    with open(filename, "r", newline=None) as fin:
        csvin = csv.reader(fin, delimiter="&")
        for row in csvin:
            yield row

def transform_data_line(line):
    """
    Transform data into key, value pairs.
    The leading and trailing & have no valid key, value pairs.
    """
    head = ("head", None)
    tail = ("tail", None)
    items = [head]
    for field in line[1:-1]:
        key, value = field.split("=")
        items.append([key, value])
    retval = OrderedDict(items)
    retval["tail"] = tail
    return retval

def process_data_line(record, text_to_replace, tracking_vars):
    """
    If the st value is ALPHA, update the record with the tracking variables
    """
    st = record.get("st")
    if st is not None:
        if st == text_to_replace:
            uniId = record.get("uniId")
            curval, original_value = tracking_vars.track(uniId)
            record["st"] = curval
            record["acceptCode"] = original_value
    return record

def process_file():
    """
    Get the long number from the command line input.
    Initialize the tracking variables.
    Process each row of the file.
    """
    long_number = sys.argv[1]
    tracking_vars = TrackingVars(int(long_number))
    for row in data_lines("data.txt"):
        record = transform_data_line(row)
        retval = process_data_line(record, "ALPHA", tracking_vars)
        yield retval

def write(iter_in, filename_out):
    """
    Write each row from the iterator to the csv.
    Make sure the first and last fields are empty.
    """
    with open(filename_out, "w", newline=None) as fout:
        csvout = csv.writer(fout, delimiter="&")
        for row in iter_in:
            encoded_row = ["{0}={1}".format(k, v) for k, v in row.items()]
            encoded_row[0] = ""
            encoded_row[-1] = ""
            csvout.writerow(encoded_row)

if __name__ == "__main__":
    write(process_file(), "data.new.txt")
Output
$ cat data.new.txt
&st=10000&type=rec&uniId=JIM&acceptCode=10000&drainNel=supp&
&st=10100&type=rec&uniId=JIM&acceptCode=10000&drainNel=supp&
&st=15100&type=rec&uniId=SARA&acceptCode=15100&drainNel=ured&
&st=15200&type=rec&uniId=SARA&acceptCode=15100&drainNel=iris&
&st=15300&type=rec&uniId=SARA&acceptCode=15100&drainNel=iris&
&st=20300&type=rec&uniId=DAVE&acceptCode=20300&drainNel=teef&
Conclusion

Only you know why the business rules for the incrementing number scheme are the way they are. However, having a control break on uniId with the st value dependent on the previous uniId's increments seems problematic to me. You could process unsorted files if each newly encountered uniId started at a new 5000 boundary, e.g. 15000, 20000, 25000, etc.

P.S.

I love the awk and perl answers. They are simple and straightforward, and they answer the question exactly as it was posed. Now all we need is a sed example :)
Just a bit more efficient control, in one line of GNU awk:

awk -F\& -v i=10000 -v OFS=\& '{ if (NR==1) { ac=i; u=$4 } else { if ($4==u) i+=100; else { i+=5000; ac=i; u=$4 } }; $2="st=" i; $5=gensub(/[0-9]+/, ac, 1, $5); print }' data.txt

It accepts any arbitrary string in the 5th field. Thanks, Shawn.

Doing operations on a large data set

I have to perform some analysis on a PSL record which contains information on DNA sequence fragments. Basically I have to find entries that are from the same read in the same contig (both values are in the PSL entry). The problem is that the PSL records are large (10-30 MB text documents). I wrote a program that works on short records, and on the long records given enough time, but it took way longer than specified: I was told the program shouldn't take more than ~15 seconds, and mine took over 15 minutes.
PSL records look like this:
275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,
171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,
188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,
My code looks like this:
import sys

class PSLreader:
    '''
    Class to provide reading of a file containing psl alignments
    formatted sequences:
    object instantiation:
    myPSLreader = PSLreader(<file name>):

    object attributes:
    fname: the initial file name

    methods:
    readPSL() : reads psl file, yielding those alignments that are within the first or last
    1000 nt
    readPSLpairs() : yields psl pairs that support a circular hypothesis

    Author: David Bernick
    Date: May 12, 2013
    '''

    def __init__(self, fname=''):
        '''constructor: saves attribute fname '''
        self.fname = fname

    def doOpen(self):
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readPSL(self):
        '''
        using filename given in init, returns each filtered psl record
        that contains alignments within the terminal 1000nt of
        the target. Incomplete psl records are discarded.
        If filename was not provided, stdin is used.

        This method selects for alignments that could be part of a circle.
        Illumina pairs aligned to the top strand would have read1(+) and read2(-).
        For the bottom strand, read1(-) and read2(+).

        For potential circularity,
        these are the conditions that can support circularity:
        read1(+) near the 3' terminus
        read1(-) near the 5' terminus
        read2(-) near the 5' terminus
        read2(+) near the 3' terminus
        so...
        any read(+) near the 3', or
        any read(-) near the 5'
        '''
        nearEnd = 1000  # this constant determines "near the end"
        with self.doOpen() as fileH:
            for line in fileH:
                pslList = line.split()
                if len(pslList) < 17:
                    continue
                tSize = int(pslList[14])
                tStart = int(pslList[15])
                strand = str(pslList[8])
                if strand.startswith('+') and (tSize - tStart > nearEnd):
                    continue
                elif strand.startswith('-') and (tStart > nearEnd):
                    continue
                yield line

    def readPSLpairs(self):
        read1 = []
        read2 = []
        for psl in self.readPSL():
            parsed_psl = psl.split()
            strand = parsed_psl[9][-1]
            if strand == '1':
                read1.append(parsed_psl)
            elif strand == '2':
                read2.append(parsed_psl)
        output = {}
        for psl1 in read1:
            name1 = psl1[9][:-1]
            contig1 = psl1[13]
            for psl2 in read2:
                name2 = psl2[9][:-1]
                contig2 = psl2[13]
                if name1 == name2 and contig1 == contig2:
                    try:
                        output[contig1] += 1
                        break
                    except:
                        output[contig1] = 1
                        break
        print(output)

PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()
I was given some example code that looks like this:
def doSomethingPairwise(a):
    for leftItem in a[1]:
        for rightItem in a[2]:
            if leftItem[1] == rightItem[1]:
                print(a)

thisStream = [['David', 'guitar', 1], ['David', 'guitar', 2],
              ['John', 'violin', 1], ['John', 'oboe', 2],
              ['Patrick', 'theremin', 1], ['Patrick', 'lute', 2]]

thisGroup = None
thisGroupList = [[], [], []]
for name, instrument, num in thisStream:
    if name != thisGroup:
        doSomethingPairwise(thisGroupList)
        thisGroup = name
        thisGroupList = [[], [], []]
    thisGroupList[num].append([name, instrument, num])
doSomethingPairwise(thisGroupList)
But when I tried to implement it my program still took a long time. Am I thinking about this the wrong way? I realize the nested loop is slow but I don't see an alternative.
Edit: I figured it out - the data was presorted, which made my brute-force solution very impractical and unnecessary.
I hope this helps you; admittedly the question needs a better example input file.
import sys

# it's better to create a PSLRecord class
class PSLRecord:
    def __init__(self, line):
        pslList = line.split()
        properties = ("matches", "misMatches", "repMatches", "nCount",
                      "qNumInsert", "qBaseInsert", "tNumInsert",
                      "tBaseInsert", "strand", "qName", "qSize", "qStart",
                      "qEnd", "tName", "tSize", "tStart", "tEnd", "blockCount",
                      "blockSizes", "qStarts", "tStarts")
        self.__dict__.update(dict(zip(properties, pslList)))

class PSLreader:
    def __init__(self, fname=''):
        self.fname = fname

    def doOpen(self):
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readPSL(self):
        with self.doOpen() as fileH:
            for line in fileH:
                pslrc = PSLRecord(line)
                yield pslrc

    # return a dictionary with all psl records grouped by qName and tName
    def readPSLpairs(self):
        dictpsl = {}
        for pslrc in self.readPSL():
            # OP requirement: remove the '1' or '2' char, via pslrc.qName[:-1]
            key = (pslrc.qName[:-1], pslrc.tName)
            if key not in dictpsl:
                dictpsl[key] = []
            dictpsl[key].append(pslrc)
        return dictpsl

# the filter function is better kept outside, self-contained
def f_filter(pslrec, nearEnd=1000):
    if (pslrec.strand.startswith('+') and
            (int(pslrec.tSize) - int(pslrec.tStart) > nearEnd)):
        return False
    if (pslrec.strand.startswith('-') and
            (int(pslrec.tStart) > nearEnd)):
        return False
    return True

PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
# read dictionary of pairs
dictpsl = PSL_obj.readPSLpairs()

# product from itertools:
# (1) x (2,3) = (1,2),(1,3)
from itertools import product

output = {}
for key, v in dictpsl.items():
    name, contig = key
    # filtered alignments from read 1
    strand_princ = [pslrec for pslrec in v if f_filter(pslrec) and
                    pslrec.qName[-1] == '1']
    # filtered alignments from read 2
    strand_sec = [pslrec for pslrec in v if f_filter(pslrec) and
                  pslrec.qName[-1] == '2']
    for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
        # this loop does fewer comparisons, since records were grouped beforehand
        if contig not in output:
            output[contig] = 0
        output[contig] += 1
Note: 10-30 MB isn't a large file, if you ask me.
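Pushing the grouping idea one step further, the pair counting needs no product at all: index the read-2 alignments by (read name, contig) in a set, then probe it once per read-1 alignment. A sketch of a drop-in replacement for the question's readPSLpairs, keeping its slicing conventions and its count-on-first-match semantics:

def readPSLpairs(self):
    read1_keys = []
    read2_keys = set()
    for psl in self.readPSL():
        parsed = psl.split()
        mate = parsed[9][-1]                # '1' or '2'
        key = (parsed[9][:-1], parsed[13])  # (read name, contig)
        if mate == '1':
            read1_keys.append(key)
        elif mate == '2':
            read2_keys.add(key)
    output = {}
    for name, contig in read1_keys:
        if (name, contig) in read2_keys:    # O(1) lookup replaces the inner loop
            output[contig] = output.get(contig, 0) + 1
    print(output)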

Python unified diff with line numbers from both "files"

I'm trying to figure out a way to create unified diffs with line numbers, showing only N lines of context. I have been unable to do this with difflib.unified_diff. I need to show line numbers for changes in both files.
The closest I can come is using diff on the command line like so:
/usr/bin/diff
--unchanged-line-format=' %.2dn %L'
--old-line-format="-%.2dn %L"
--new-line-format="+%.2dn %L"
file1.py
file2.py
BUT I only want to show N lines of context, and /usr/bin/diff doesn't seem to support context with a custom line format (e.g. -U2 is not compatible with --line-format: "conflicting output style options").
Below is an example of what I'd like to accomplish (the same output as the diff above, but showing only 1 line of context around changes):
+01: def renamed_function()
-01: def original_function():
02:
+03: """ Neat stuff here """
04:
21:
+22: # Here's a new comment
23:
85: # Output the value of foo()
+86: print "Foo is %s"%(foo())
-86: print foo()
87:
I was able to figure out something very close to what I wanted. It's slower than regular diff, though. Here's the entire code, from my project GitGate.
import difflib
import os
import re

def unified_diff(to_file_path, from_file_path, context=1):
    """ Returns a list of differences between two files based
    on some context. This is probably over-complicated. """
    pat_diff = re.compile(r'@@ (.[0-9]+\,[0-9]+) (.[0-9]+,[0-9]+) @@')
    from_lines = []
    if os.path.exists(from_file_path):
        from_fh = open(from_file_path, 'r')
        from_lines = from_fh.readlines()
        from_fh.close()
    to_lines = []
    if os.path.exists(to_file_path):
        to_fh = open(to_file_path, 'r')
        to_lines = to_fh.readlines()
        to_fh.close()
    diff_lines = []
    lines = difflib.unified_diff(to_lines, from_lines, n=context)
    for line in lines:
        if line.startswith('--') or line.startswith('++'):
            continue
        m = pat_diff.match(line)
        if m:
            left = m.group(1)
            right = m.group(2)
            lstart = left.split(',')[0][1:]
            rstart = right.split(',')[0][1:]
            diff_lines.append("@@ %s %s @@\n" % (left, right))
            to_lnum = int(lstart)
            from_lnum = int(rstart)
            continue
        code = line[0]
        lnum = from_lnum
        if code == '-':
            lnum = to_lnum
        diff_lines.append("%s%.4d: %s" % (code, lnum, line[1:]))
        if code == '-':
            to_lnum += 1
        elif code == '+':
            from_lnum += 1
        else:
            to_lnum += 1
            from_lnum += 1
    return diff_lines
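Usage is then straightforward (the file names here are just placeholders); each returned line keeps its trailing newline, so print with end='':

for line in unified_diff('file1.py', 'file2.py', context=1):
    print(line, end='')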
