Doing operations on a large data set - python
I have to perform some analysis on a PSL record which contains information on DNA sequence fragments. Basically I have to find entries that are from the same read in the same contig (these are both values in the PSL entry). The problem is the PSL records are large (10-30 Mb text documents). I wrote a program that works on short records and on the long records given enough time but it took way longer than specified. I was told the program shouldn't take more than ~15 seconds. Mine took over 15 minutes.
PSL records look like this:
275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,
171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,
188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,
My code looks like this:
import sys
class PSLreader :
    '''
    Class to provide reading of a file containing psl alignments
    formatted sequences:
    object instantiation:
    myPSLreader = PSLreader(<file name>):
    object attributes:
    fname: the initial file name
    methods:
    readPSL() : reads psl file, yielding those alignments that are within the first or last
    1000 nt
    readPSLpairs() : counts, per contig, the read1 alignments that have a read2
    alignment of the same read in the same contig
    Author: David Bernick
    Date: May 12, 2013
    '''

    def __init__ (self, fname=''):
        '''constructor: saves attribute fname '''
        self.fname = fname

    def doOpen (self):
        '''Return an open handle on fname, or sys.stdin when fname is empty.'''
        # compare by value; "is ''" tests object identity, which is not
        # guaranteed to hold for equal strings
        if self.fname == '':
            return sys.stdin
        return open(self.fname)

    def readPSL (self):
        '''
        using filename given in init, yields each filtered psl record (the raw line)
        that contains an alignment within the terminal 1000nt of
        the target. Incomplete psl records (fewer than 17 fields) are discarded.
        If filename was not provided, stdin is used.
        This method selects for alignments that may be part of a
        circle.
        Illumina pairs aligned to the top strand would have read1(+) and read2(-).
        For the bottom strand, read1(-) and read2(+).
        For potential circularity,
        these are the conditions that can support circularity:
        read1(+) near the 3' terminus
        read1(-) near the 5' terminus
        read2(-) near the 5' terminus
        read2(+) near the 3' terminus
        so...
        any read(+) near the 3', or
        any read(-) near the 5'
        '''
        nearEnd = 1000 # this constant determines "near the end"
        with self.doOpen() as fileH:
            for line in fileH:
                pslList = line.split()
                if len(pslList) < 17:
                    continue
                tSize = int(pslList[14])
                tStart = int(pslList[15])
                strand = pslList[8]
                if strand.startswith('+') and (tSize - tStart > nearEnd):
                    continue
                elif strand.startswith('-') and (tStart > nearEnd):
                    continue
                yield line

    def readPSLpairs (self):
        '''
        Count, per contig, the filtered read1 alignments for which at least
        one filtered read2 alignment of the same read exists in that contig.

        A record is read1 or read2 according to the last character of its
        query name (field 9); the read name is the query name without that
        character and the contig is field 13.
        Prints the {contig: count} dictionary and also returns it.
        '''
        read1 = []          # (read name, contig) of every read1 record, in file order
        read2_keys = set()  # (read name, contig) of every read2 record
        for psl in self.readPSL():
            parsed_psl = psl.split()
            qName = parsed_psl[9]
            key = (qName[:-1], parsed_psl[13])
            if qName[-1] == '1':
                read1.append(key)
            elif qName[-1] == '2':
                read2_keys.add(key)
        output = {}
        # one set lookup per read1 record replaces a scan over every read2
        # record, so the work grows linearly with the file size
        for key in read1:
            if key in read2_keys:
                contig = key[1]
                output[contig] = output.get(contig, 0) + 1
        print(output)
        return output
# Run the pair count on the PSL file; readPSLpairs prints the resulting dictionary
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()
I was given some example code that looks like this:
def doSomethingPairwise (a):
    '''
    Print the whole group once for every left/right pair whose element 1 is equal.

    a: list whose slot 1 holds the "left" items and slot 2 the "right" items
       (slot 0 is not read here); each item is a sequence and its element 1
       is the value being compared.
    '''
    for leftItem in a[1]:
        for rightItem in a[2]:
            # compare by value: 'is' tests object identity and only appears to
            # work when the interpreter happens to share the two strings
            if leftItem[1] == rightItem[1]:
                print (a)
# Sample stream: rows for the same name are adjacent, so a group is complete
# as soon as the name changes.
thisStream = [
    ['David', 'guitar', 1],
    ['David', 'guitar', 2],
    ['John', 'violin', 1],
    ['John', 'oboe', 2],
    ['Patrick', 'theremin', 1],
    ['Patrick', 'lute', 2],
]
thisGroup = None
thisGroupList = [[], [], []]
for record in thisStream:
    name, instrument, num = record
    if thisGroup != name:
        # a new name starts: process the finished group, then start a fresh one
        doSomethingPairwise(thisGroupList)
        thisGroup, thisGroupList = name, [[], [], []]
    # slot 'num' (1 or 2 in the sample data) collects that side of the pair
    thisGroupList[num].append([name, instrument, num])
# the last group is never followed by a name change, so process it here
doSomethingPairwise(thisGroupList)
But when I tried to implement it my program still took a long time. Am I thinking about this the wrong way? I realize the nested loop is slow but I don't see an alternative.
Edit: I figured it out, the data was presorted which made my brute force solution very impractical and unnecessary.
I hope this helps; the question would benefit from a better example input file.
# It is better to create a PSLRecord class
class PSLRecord:
    '''
    One PSL line exposed as attributes named after the PSL columns.

    The line is split on whitespace and the tokens are assigned, in order,
    to the column names below. Values stay strings. A line with fewer tokens
    than columns only gets the leading attributes; extra tokens are ignored.
    '''
    def __init__(self, line):
        columns = (
            "matches", "misMatches", "repMatches", "nCount",
            "qNumInsert", "qBaseInsert", "tNumInsert", "tBaseInsert",
            "strand", "qName", "qSize", "qStart", "qEnd",
            "tName", "tSize", "tStart", "tEnd",
            "blockCount", "blockSizes", "qStarts", "tStarts",
        )
        # zip stops at the shorter sequence, so short lines set fewer attributes
        for column, token in zip(columns, line.split()):
            setattr(self, column, token)
class PSLreader :
    '''
    Reads a PSL file (or stdin when no file name is given) into PSLRecord
    objects and groups them by read name and target name.
    '''

    def __init__ (self, fname=''):
        '''constructor: saves attribute fname '''
        self.fname = fname

    def doOpen (self):
        '''Return an open handle on fname, or sys.stdin when fname is empty.'''
        # compare by value; "is ''" tests object identity, which is not
        # guaranteed to hold for equal strings
        if self.fname == '':
            return sys.stdin
        return open(self.fname)

    def readPSL (self):
        '''Yield one PSLRecord per complete line of the input.'''
        with self.doOpen() as fileH:
            for line in fileH:
                pslrc = PSLRecord(line)
                # PSLRecord only sets attributes for the columns present on
                # the line; blank or truncated lines (no tEnd, the 17th
                # column) are discarded so later attribute access cannot fail
                if not hasattr(pslrc, 'tEnd'):
                    continue
                yield pslrc

    #return a dictionary with all psl records grouped by qName and tName
    def readPSLpairs (self):
        '''
        Return {(read name, tName): [PSLRecord, ...]}, where the read name is
        qName without its last character (the read number).
        '''
        dictpsl = {}
        for pslrc in self.readPSL():
            #OP requirement, remove '1' or '2' char, in pslrc.qName[:-1]
            key = (pslrc.qName[:-1], pslrc.tName)
            dictpsl.setdefault(key, []).append(pslrc)
        return dictpsl
# The filter is better kept outside the reader, as a self-contained function
def f_filter(pslrec, nearEnd = 1000):
    '''
    Return True when the alignment should be kept, False when it is rejected.

    pslrec: object with string attributes strand, tSize and tStart.
    nearEnd: largest distance still accepted.

    A record whose strand starts with '+' is rejected when
    tSize - tStart exceeds nearEnd; one whose strand starts with '-' is
    rejected when tStart exceeds nearEnd. Any other strand is kept.
    '''
    strand = pslrec.strand
    if strand.startswith('+'):
        too_far = int(pslrec.tSize) - int(pslrec.tStart) > nearEnd
    elif strand.startswith('-'):
        too_far = int(pslrec.tStart) > nearEnd
    else:
        # the numeric fields are not even parsed for other strands
        too_far = False
    return not too_far
# Reader for the PSL file named in the question
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
# Read every record into a dictionary keyed by
# (qName without its last character, tName)
dictpsl = PSL_obj.readPSLpairs()
from itertools import product
#product from itertools
#(1) x (2,3) = (1,2),(1,3)
# Count, per contig, the (read 1, read 2) alignment pairs that pass the filter.
output = {}
for key, v in dictpsl.items():
    name, contig = key
    # apply the filter once per record instead of once per list
    kept = [pslrec for pslrec in v if f_filter(pslrec)]
    # filtered alignments of the principal read (qName ends in '1')
    strand_princ = [pslrec for pslrec in kept if pslrec.qName[-1] == '1']
    # filtered alignments of the secondary read (qName ends in '2')
    strand_sec = [pslrec for pslrec in kept if pslrec.qName[-1] == '2']
    for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
        # This loop has few comparisons because the records were grouped
        # beforehand. Counting starts from 0 so that the first pair of a
        # contig is counted once, not twice.
        # NOTE(review): every pair is counted; if only one hit per read 1
        # alignment is wanted, count len(strand_princ) when strand_sec is
        # non-empty instead.
        output[contig] = output.get(contig, 0) + 1
Note: a 10-30 MB file isn't large, if you ask me.
Related
Attempting to Traverse a search API Collecting results in Python 3.9
I am attempting to yield all results from a search api. There are 3 key cases: Case 1: There are 0 results yielded Case 2: There are 1-9 results yielded Case 3: There are 10 results yielded In both cases 1 and 2, we can conclude that there are no results deeper. i.e if we searched for "iiiiia" and it yields 0 then that means there are no results for "iiiiia" and no results deeper than "iiiiia". If "iiiiia" yields 1-9 results, then we can concluded "iiiiia" yields 109 results and no results deeper. In case 3, if we search for "iiiiia" and it yields 10, we can conclude that "iiiiia" has 10 results, and there may or may not be results deeper as 10 means 10+ results possible. In this case we would have to move one layer down the chain and traverse through those results. i.e "iiiiia0"-"iiiiiaz" and if any of those yield 10 we would also have to go one layer deeper. an example of how this may look: a a0 a00 a000 a0000 a0001 ... a000z a001 a002 a003 a0030 ... a0034 a00340 ... a0034z a0035... Here is my attempted code: import json from time import sleep import urllib3 import requests import re import random import sys import numpy # Saving the reference of the standard output original_stdout = sys.stdout header = {} link = '/search?query=' http = requests.session() fileName = '/output.txt' hasCompleted = 0 number = '11' def search(target): global targetList global link global header global http resp = http.request('GET',link+target,headers=header) match = re.findall('"key":"(.+?)","',resp.text) if len(match) == 10: return False if len(match) == 0: return " " elif len(match) < 10: return resp.text def treeSearch(): global fileName global number global hasCompleted if hasCompleted == 1: new = (int(int(number, 36) / 36) + 1) new = numpy.base_repr(new, 36) number = new hasCompleted = 0 if hasCompleted == 0: x = 0 new = int(int(number, 36)*36) new = numpy.base_repr(new, 36) number = new while x < 37: new = int(int(number, 36) + 1) new = numpy.base_repr(new, 36) number = new 
result = search(number) print(number) if result: with open(fileName, 'a+') as f: sys.stdout = f print(result) sys.stdout = original_stdout x = x + 1 else: treeSearch() temp = number number = (int(int(number, 36) / 36) + 1) #maybe not + 1 print(number) number = numpy.base_repr(number, 36) print(number) result = search(number) if not result == " ": new = int(int(number, 36)*36) new = numpy.base_repr(new, 36) number = new hasCompleted = 1 treeSearch() Here is my output: 111 1111 11111 111111 1111111 11111111 111111111 1111111111 11111111111 111111111111 1111111111111 11111111111111 111111111111111 1111111111111111 1111111111111112 1111111111111113 1111111111111114 1111111111111115 1111111111111116 1111111111111117 1111111111111118 1111111111111119 111111111111111A 111111111111111B 111111111111111C 111111111111111D 111111111111111E 111111111111111F 111111111111111G 111111111111111H 111111111111111I 111111111111111J 111111111111111K 111111111111111L 111111111111111M 111111111111111N 111111111111111O 111111111111111P 111111111111111Q 111111111111111R 111111111111111S 111111111111111T 111111111111111U 111111111111111V 111111111111111W 111111111111111X 111111111111111Y 111111111111111Z 1111111111111120 1111111111111121 6316397706306666889217 11111111110QR5T 11111111110QR5U 11111111110QR5V 11111111110QR5W 11111111110QR5X 11111111110QR5Y 11111111110QR5Z 11111111110QR60 11111111110QR61 11111111110QR62 11111111110QR63 11111111110QR64 11111111110QR65 11111111110QR66 11111111110QR67 11111111110QR68 11111111110QR69 11111111110QR6A 11111111110QR6B 11111111110QR6C 11111111110QR6D 11111111110QR6E 11111111110QR6F 11111111110QR6G 11111111110QR6H 11111111110QR6I 11111111110QR6J 11111111110QR6K 11111111110QR6L 11111111110QR6M 11111111110QR6N 11111111110QR6O 11111111110QR6P 11111111110QR6Q 11111111110QR6R 11111111110QR6S 11111111110QR6T 11111111110QR6U 175455491841851850753 11111111110L4X 11111111110L4Y 11111111110L4Z 11111111110L50 11111111110L51 11111111110L52 11111111110L53 
11111111110L54 11111111110L55 11111111110L56 11111111110L57 11111111110L58 11111111110L59 11111111110L5A 11111111110L5B 11111111110L5C 11111111110L5D 11111111110L5E 11111111110L5F 11111111110L5G 11111111110L5H 11111111110L5I 11111111110L5J 11111111110L5K 11111111110L5L 11111111110L5M 11111111110L5N 11111111110L5O 11111111110L5P 11111111110L5Q 11111111110L5R 11111111110L5S 11111111110L5T 11111111110L5U 11111111110L5V 11111111110L5W 11111111110L5X 11111111110L5Y 4873763662273662977 11111111110XT 11111111110XU 11111111110XV 11111111110XW 11111111110XX 11111111110XY 11111111110XZ 11111111110Y0 11111111110Y1 11111111110Y2 11111111110Y3 11111111110Y4 11111111110Y5 11111111110Y6 11111111110Y7 11111111110Y8 11111111110Y9 11111111110YA 11111111110YB 11111111110YC 11111111110YD 11111111110YE 11111111110YF 11111111110YG 11111111110YH 11111111110YI 11111111110YJ 11111111110YK 11111111110YL 11111111110YM 11111111110YN 11111111110YO 11111111110YP 11111111110YQ 11111111110YR 11111111110YS 11111111110YT 11111111110YU 135382323952046193 11111111110X 11111111110Y 11111111110Z 111111111110 111111111111 111111111121 111111111122 111111111123 111111111124 111111111125 111111111126 111111111127 111111111128 111111111129 11111111112A 11111111112B 11111111112C 11111111112D 11111111112E 11111111112F 11111111112G 11111111112H 11111111112I 11111111112J 11111111112K 11111111112L 11111111112M 11111111112N 11111111112O 11111111112P 11111111112Q 11111111112R 11111111112S 11111111112T 11111111112U 11111111112V 11111111112W 11111111112X 11111111112Y 11111111112Z 111111111130 111111111131 3760620109779064 11111111114 111111111141 111111111142 111111111143 111111111144 111111111145 111111111146 111111111147 111111111148 111111111149 11111111114A 11111111114B 11111111114C 11111111114D 11111111114E 11111111114F 11111111114G 11111111114H 11111111114I 11111111114J 11111111114K 11111111114L 11111111114M 11111111114N 11111111114O 11111111114P 11111111114Q 11111111114R 11111111114S 11111111114T 
11111111114U 11111111114V 11111111114W 11111111114X 11111111114Y 3760620109779066 11111111116 11111111117 11111111118 11111111119 1111111111A 1111111111B 1111111111C 1111111111D 1111111111E 1111111111F 1111111111G 1111111111H 1111111111I 1111111111J 1111111111K 1111111111L 1111111111M 1111111111N 1111111111O 1111111111P 1111111111Q 1111111111R 1111111111S 1111111111T 1111111111U 1111111111V 1111111111W 1111111111X 1111111111Y 1111111111Z 11111111120 11111111121 11111111122 11111111123 11111111124 11111111125 11111111126 11111111127 104461669716087 1111111113 1111111114 1111111115 1111111116 1111111117 1111111118 1111111119 111111111A 111111111B 111111111C 111111111D 111111111E 111111111F 111111111G 111111111H 111111111I 111111111J 111111111K 111111111L 111111111M 111111111N 111111111O 111111111P 111111111Q 111111111R 111111111S 111111111T 111111111U 111111111V 111111111W 111111111X 111111111Y 111111111Z 1111111120 1111111121 1111111122 1111111123 1111111124 2901713047671 111111113 111111114 111111115 111111116 111111117 111111118 111111119 11111111A 11111111B 11111111C 11111111D 11111111E 11111111F 11111111G 11111111H 11111111I 11111111J 11111111K 11111111L 11111111M 11111111N 11111111O 11111111P 11111111Q 11111111R 11111111S 11111111T 11111111U 11111111V 11111111W 11111111X 11111111Y 11111111Z 111111120 111111121 111111122 111111123 111111124 80603140215 11111113 11111114 11111115 11111116 11111117 11111118 11111119 1111111A 1111111B 1111111C 1111111D 1111111E 1111111F 1111111G 1111111H 1111111I 1111111J 1111111K 1111111L 1111111M 1111111N 1111111O 1111111P 1111111Q 1111111R 1111111S 1111111T 1111111U 1111111V 1111111W 1111111X 1111111Y 1111111Z 11111120 11111121 11111122 11111123 11111124 2238976119 1111113 11111131 11111132 11111133 11111134 11111135 11111136 11111137 11111138 11111139 1111113A 1111113B 1111113C 1111113D 1111113E 1111113F 1111113G 1111113H 1111113I 1111113J 1111113K 1111113L 1111113M 1111113N 1111113O 1111113P 1111113Q 1111113R 1111113S 1111113T 
1111113U 1111113V 1111113W 1111113X 1111113Y 1111113Z 11111140 11111141 2238976121 1111115 11111151 11111152 11111153 11111154 11111155 11111156 11111157 11111158 11111159 1111115A 1111115B 1111115C 1111115D 1111115E 1111115F 1111115G 1111115H 1111115I 1111115J 1111115K 1111115L 1111115M 1111115N 1111115O 1111115P 1111115Q 1111115R 1111115S 1111115T 1111115U 1111115V 1111115W 1111115X 1111115Y 1111115Z 11111160 11111161 2238976123 1111117 11111171 11111172 11111173 11111174 11111175 11111176 11111177 11111178 11111179 1111117A 1111117B 1111117C 1111117D 1111117E 1111117F 1111117G 1111117H 1111117I 1111117J 1111117K 1111117L 1111117M 1111117N 1111117O 1111117P 1111117Q 1111117R 1111117S 1111117T 1111117U 1111117V 1111117W 1111117X 1111117Y 1111117Z 11111180 11111181 2238976125 1111119 111111A 111111B 111111C 111111D 111111E 111111F 111111G 111111H 111111I 111111J 111111K 111111L 111111M 111111N 111111O 111111P 111111Q 111111R 111111S 111111T 111111U 111111V 111111W 111111X 111111Y 111111Z 1111120 1111121 1111122 1111123 1111124 1111125 1111126 1111127 1111128 1111129 111112A 62193783 111113 1111131 1111132 1111133 1111134 1111135 1111136 1111137 1111138 1111139 111113A 111113B 111113C 111113D 111113E 111113F 111113G 111113H 111113I 111113J 111113K 111113L 111113M 111113N 111113O 111113P 111113Q 111113R 111113S 111113T 111113U 111113V 111113W 111113X 111113Y 111113Z 1111140 1111141 62193785 111115 My code traverses in only deeper once then comes out, I will keep working on my code however I am hoping to find an easier solution or possibly a library that can perform this style of search. Thanks!
Python unified diff with line numbers from both "files"
I'm trying to figure out a way to create unified diffs with line numbers only showing N lines of context. I have been unable to do this with difflib.unified_diff. I need to show changes in both files. The closest I can come is using diff on the command line like so: /usr/bin/diff --unchanged-line-format=' %.2dn %L' --old-line-format="-%.2dn %L" --new-line-format="+%.2dn %L" file1.py file2.py BUT I only want to show N lines of context, and /usr/bin/diff doesn't seem to support context with a custom line format (eg. -U2 is not compatible with --line-format "conflicting output style options"). Below is an example of what I'd like to accomplish (the same output as the above diff, but only showing 1 line of context surrounding changes): +01: def renamed_function() -01: def original_function(): 02: +03: """ Neat stuff here """ 04: 21: +22: # Here's a new comment 23: 85: # Output the value of foo() +86: print "Foo is %s"%(foo()) -86: print foo() 87:
I was able to figure out something very close to what I wanted to do. It's slower than regular diff, though. Here's the entire code, from my project GitGate. def unified_diff(to_file_path, from_file_path, context=1): """ Returns a list of differences between two files based on some context. This is probably over-complicated. """ pat_diff = re.compile(r'## (.[0-9]+\,[0-9]+) (.[0-9]+,[0-9]+) ##') from_lines = [] if os.path.exists(from_file_path): from_fh = open(from_file_path,'r') from_lines = from_fh.readlines() from_fh.close() to_lines = [] if os.path.exists(to_file_path): to_fh = open(to_file_path,'r') to_lines = to_fh.readlines() to_fh.close() diff_lines = [] lines = difflib.unified_diff(to_lines, from_lines, n=context) for line in lines: if line.startswith('--') or line.startswith('++'): continue m = pat_diff.match(line) if m: left = m.group(1) right = m.group(2) lstart = left.split(',')[0][1:] rstart = right.split(',')[0][1:] diff_lines.append("## %s %s ##\n"%(left, right)) to_lnum = int(lstart) from_lnum = int(rstart) continue code = line[0] lnum = from_lnum if code == '-': lnum = to_lnum diff_lines.append("%s%.4d: %s"%(code, lnum, line[1:])) if code == '-': to_lnum += 1 elif code == '+': from_lnum += 1 else: to_lnum += 1 from_lnum += 1 return diff_lines
Parse svg:path d attribute
I need to decipher a path element in an SVG document to drive a CNC machine along that path. I wonder if there are any Python libraries that parse SVG and give some sort of pythonic list for the d attribute, e.g.: <path d="M 20 30 L 20 20 20 40 40 40"/> parses into [["M", 20, 30], ["L", 20, 20], ["L", 20, 40], ["L", 40, 40]]
Here's a start it's written by me and in python 2.7.2. Just delete the tests and print statements if you want to. Copyright 2012 Christopher L. Ramsey Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. from collections import OrderedDict from re import match from re import split from re import sub class PathIterator(object): EOI = 'End of Iteration' PATH_IDENTIFIERS = r'[MLHVCSQTAmlhvcsqa]' NUMBERS = r'[0-9.-^A-z]' SEPERATORS = r'\s|\,' PATH_END = r'[Zz]' def __init__(self, path): self.parseable = path.translate(None, '\t\f') self.parseable = self.parseable.replace('\n', ' ') print 'strip_newlines: {}'.format(self.parseable) self.parseable = sub(r'([A-Za-z])([0-9]|\-)', self.insert, self.parseable) print 'add_space: {}'.format(self.parseable) self.parseable = self.parseable.replace(',', ' ') print 'replace_commas: {}'.format(self.parseable) self.parseable = sub(r'\s+', ' ', self.parseable) # replace any double space with a single space print 'strip_extra_space: {}'.format(self.parseable) self.tokens = split(' ', self.parseable) self.map = self.produce_map(self.tokens) print self.map self.elements = self.process(self.map) def produce_map(self, tkns): self.m = OrderedDict() self.i = 0 while self.i < len(tkns): if match(self.PATH_IDENTIFIERS, tkns[self.i]): self.m[self.i] = tkns[self.i] elif match(self.PATH_END, tkns[self.i]): self.m[self.i] = tkns[self.i] else: pass self.i += 1 return self.m.items() def process(self, map): self.mm = [] self.l = len(map) for e in range(self.l): try: self.element = map[e] 
self.future = map[e + 1] self.ident = self.element[1] self.start = self.element[0] + 1 self.end = self.future[0] self.nbrs = self.tokens[self.start:self.end] except: self.element = map[e] self.ident = self.element[1] self.start = self.element[0] + 1 self.end = len(self.tokens) self.nbrs = self.tokens[self.start:self.end] print 'start: {} end {}'.format(self.start, self.end) finally: self.numbers = [] for number in self.nbrs: self.numbers.append(float(number)) self.mm.append((self.ident, self.numbers)) return iter(self.mm) def next(self): try: return self.elements.next() except: return self.EOI def insert(self, match_obj): self.group = match_obj.group() return '{} {}'.format(self.group[0], self.group[1]) if __name__ == '__main__': inkscape_path = "M 12,90 C 8.676,90 6,87.324 6,84 L 6,82 6,14 6,12 c 0,-0.334721 0.04135,-0.6507 0.09375,-0.96875 0.0487,-0.295596 0.09704,-0.596915 0.1875,-0.875 C 6.29113,10.12587 6.302142,10.09265 6.3125,10.0625 6.411365,9.774729 6.5473802,9.515048 6.6875,9.25 6.8320918,8.976493 7.0031161,8.714385 7.1875,8.46875 7.3718839,8.223115 7.5612765,7.995278 7.78125,7.78125 8.221197,7.353194 8.72416,6.966724 9.28125,6.6875 9.559795,6.547888 9.8547231,6.440553 10.15625,6.34375 9.9000482,6.443972 9.6695391,6.580022 9.4375,6.71875 c -0.00741,0.0044 -0.023866,-0.0045 -0.03125,0 -0.031933,0.0193 -0.062293,0.04251 -0.09375,0.0625 -0.120395,0.0767 -0.2310226,0.163513 -0.34375,0.25 -0.1061728,0.0808 -0.2132809,0.161112 -0.3125,0.25 C 8.4783201,7.442683 8.3087904,7.626638 8.15625,7.8125 8.0486711,7.942755 7.9378561,8.077785 7.84375,8.21875 7.818661,8.25713 7.805304,8.30462 7.78125,8.34375 7.716487,8.446782 7.6510225,8.548267 7.59375,8.65625 7.4927417,8.850956 7.3880752,9.071951 7.3125,9.28125 7.30454,9.30306 7.288911,9.3218 7.28125,9.34375 7.2494249,9.4357 7.2454455,9.530581 7.21875,9.625 7.1884177,9.731618 7.1483606,9.828031 7.125,9.9375 7.0521214,10.279012 7,10.635705 7,11 l 0,2 0,68 0,2 c 0,2.781848 2.2181517,5 5,5 l 2,0 68,0 2,0 c 2.781848,0 
5,-2.218152 5,-5 l 0,-2 0,-68 0,-2 C 89,10.635705 88.94788,10.279012 88.875,9.9375 88.83085,9.730607 88.78662,9.539842 88.71875,9.34375 88.71105,9.3218 88.69545,9.30306 88.6875,9.28125 88.62476,9.107511 88.549117,8.913801 88.46875,8.75 88.42717,8.6672 88.38971,8.580046 88.34375,8.5 88.28915,8.40279 88.216976,8.31165 88.15625,8.21875 88.06214,8.077785 87.951329,7.942755 87.84375,7.8125 87.700576,7.63805 87.540609,7.465502 87.375,7.3125 87.36383,7.3023 87.35502,7.29135 87.34375,7.28125 87.205364,7.155694 87.058659,7.046814 86.90625,6.9375 86.803679,6.86435 86.701932,6.784136 86.59375,6.71875 c -0.0074,-0.0045 -0.02384,0.0044 -0.03125,0 -0.232039,-0.138728 -0.462548,-0.274778 -0.71875,-0.375 0.301527,0.0968 0.596455,0.204138 0.875,0.34375 0.55709,0.279224 1.060053,0.665694 1.5,1.09375 0.219973,0.214028 0.409366,0.441865 0.59375,0.6875 0.184384,0.245635 0.355408,0.507743 0.5,0.78125 0.14012,0.265048 0.276135,0.524729 0.375,0.8125 0.01041,0.03078 0.02133,0.06274 0.03125,0.09375 0.09046,0.278085 0.1388,0.579404 0.1875,0.875 C 89.95865,11.3493 90,11.665279 90,12 l 0,2 0,68 0,2 c 0,3.324 -2.676,6 -6,6 l -72,0 z" mdn_path = "M10 80 Q 52.5 10, 95 80 T 180 80" w3c_path = "M100,200 C100,100 250,100 250,200 S400,300 400,200" w3c_path_neg = "M-100,200 C100,100 250,100 250,200 S-400,300 400,200" w3c_path_nl = ''' M600,350 l 50,-25 a25,25 -30 0,1 50,-25 l 50,-25 a25,50 -30 0,1 50,-25 l 50,-25 a25,75 -30 0,1 50,-25 l 50,-25 a25,100 -30 0,1 50,-25 l 50,-25 ''' paths = [inkscape_path, mdn_path, w3c_path, str.strip(w3c_path_nl), w3c_path_neg] for path in paths: p = PathIterator(path) char = '' while char != PathIterator.EOI: char = p.next() print char
Getting the d-string can be done in a couple of lines using svgpathtools; the rest can be done using regular expressions. from svgpathtools import svg2paths paths, attributes = svg2paths('some_svg_file.svg') paths is a list of svgpathtools Path objects (containing just the curve info, no colors, styles, etc.). attributes is a list of dictionary objects of the attributes. Suppose the path you are interested in is the first (the 0th) listed in your SVG, then to extract just the d-string you can use: d = attributes[0]['d'] # d-string from first path in SVG # Now for some regular expressions magic import re split_by_letters = re.findall('[A-Z|a-z][^A-Z|a-z]*', d) split_as_you_want = [] for x in split_by_letters: nums = x[1:].replace(',',' ').split() # list of numbers after letter for k in range(len(nums) // 2): split_as_you_want.append([x[0]] + [nums[k]] + [nums[k+1]]) print split_as_you_want I didn't convert the strings into numbers here, as how you want to do that depends on whether they're always integers and whether you care that they stay that way. For most purposes this can be done with something like the following right below the "nums = ..." line. for k, n in enumerate(nums): try: nums[k] = int(n) except ValueError: nums[k] = float(n)
load parameters from a file in Python
I am writing a Python class to model a process and I want to initialized the parameters from a file, say 'input.dat'. The format of the input file looks like this. 'input.dat' file: Z0: 0 0 k: 0.1 g: 1 Delta: 20 t_end: 300 The code I wrote is the following. It works but appears redundant and inflexible. Is there a better way to do the job? Such as a loop to do readline() and then match the keyword? def load(self,filename="input.dat"): FILE = open(filename) s = FILE.readline().split() if len(s) is 3: self.z0 = [float(s[1]),float(s[2])] # initial state s = FILE.readline().split() if len(s) is 2: self.k = float(s[1]) # kappa s = FILE.readline().split() if len(s) is 2: self.g = float(s[1]) s = FILE.readline().split() if len(s) is 2: self.D = float(s[1]) # Delta s = FILE.readline().split() if len(s) is 2: self.T = float(s[1]) # end time
Assuming the params are coming from a safe place (made by you or users, not the internet), just make the parameters file a Python file, params.py: Z0 = (0, 0) k = 0.1 g = 1 Delta = 20 t_end = 300 Then in your code all you need is: import params fancy_calculation(10, k=params.k, delta=params.Delta) The beauty of this is two-fold: 1) simplicity, and 2) you can use the power of Python in your parameter descriptions -- particularly useful here, for example: k = 0.1 Delta = 20 g = 3 * k + Delta Alternatively, you could use Python's built-in JSON or ConfigParser .INI parser modules.
If you are open to keeping your parameters in a different kind of file, I would suggest using a YAML file. The Python library is PyYAML, and it is easy to use from Python. For a better introduction, look at this Wikipedia article: http://en.wikipedia.org/wiki/YAML. The benefit is that you can read the parameter values as lists or maps. You would love it!
Try the following: def load(self, filename="input.dat"): d = {"Z0": "z0", "k": "k", "g": "g", "Delta": "D", "t_end": "T"} FILE = open(filename) for line in FILE: name, value = line.split(":") value = value.strip() if " " in value: value = map(float, value.split()) else: value = float(value) setattr(self, d[name], value) Proof that it works: >>> class A(object): pass ... >>> a = A() >>> load(a) >>> a.__dict__ {'k': 0.10000000000000001, 'z0': [0.0, 0.0], 'D': 20.0, 'g': 1.0, 'T': 300.0}
As others have mentioned, in Python you can create object attributes dynamically "on the fly". That means you could do something like the following to create Params objects as they're read-in. I've tried to make the code as data-driven as possible, so relatively flexible. # maps label to attribute name and types label_attr_map = { "Z0:": ["z0", float, float], "k:": [ "k", float], "g:": [ "g", float], "Delta:": [ "D", float], "t_end:": [ "T", float] } class Params(object): def __init__(self, input_file_name): with open(input_file_name, 'r') as input_file: for line in input_file: row = line.split() label = row[0] data = row[1:] # rest of row is data list attr = label_attr_map[label][0] datatypes = label_attr_map[label][1:] values = [(datatypes[i](data[i])) for i in range(len(data))] self.__dict__[attr] = values if len(values) > 1 else values[0] params = Params('input.dat') print 'params.z0:', params.z0 print 'params.k:', params.k print 'params.g:', params.g print 'params.D:', params.D print 'params.T:', params.T Output: params.z0: [0.0, 0.0] params.k: 0.1 params.g: 1.0 params.D: 20.0 params.T: 300.0
# Perhaps this might give you what you need:
def load(self,filename='input.dat'):
    '''
    Set one attribute on self per "label: value [value]" line of filename.

    The attribute name is the first token without its trailing ':'. A line
    with one value stores that value; a line with two values stores them as
    a list. Values are kept as strings. Lines with any other number of
    tokens are ignored.
    '''
    with open(filename) as fh:
        for line in fh:
            s = line.split()
            # s[0] is the label and s[1:] the values; strip the ':' so the
            # attribute can be reached with normal dotted access
            if len(s) == 2:
                setattr(self, s[0].rstrip(':'), s[1])
            elif len(s) == 3:
                setattr(self, s[0].rstrip(':'), s[1:])
# I also didn't include any error checking, but setattr is very handy.
# Something like this:
def load(self,filename="input.dat"):
    '''
    Read "label: value ..." lines from filename and store them on self as floats.

    A label is stored under the attribute name given by namemap (or under
    the label itself when it has no entry). A single value is stored as a
    float, several values as a list of floats. Blank lines, lines with a
    non-numeric value and lines whose value count differs from the expected
    one are skipped.
    '''
    # maps names to number of fields they need
    # only necessary for variables with more than 1 field
    argmap = dict(Z0=2)
    # maps config file names to their attribute names on the object
    # if name is the same both places, no need
    namemap = dict(Z0="z0", Delta="D", t_end="T")
    with open(filename) as FILE:
        for line in FILE:
            s = line.split()
            if not s:
                # blank line: there is no label to read
                continue
            var = s[0].rstrip(":")
            try:
                val = [float(x) for x in s[1:]]
            except ValueError:
                continue
            # look the expected count up in argmap, the table defined above
            if len(val) == argmap.get(var, 1):
                if len(val) == 1:
                    val = val[0]
                setattr(self, namemap.get(var, var), val)
Python objects have a built-in __dict__ member. You can modify it, and then refer to properties as obj.key. class Data(object): def __init__(self, path='infile.dat'): with open(path, 'r') as fo: for line in fo.readlines(): if len(line) < 2: continue parts = [s.strip(' :\n') for s in line.split(' ', 1)] numbers = [float(s) for s in parts[1].split()] # This is optional... do you want single values to be stored in lists? if len(numbers) == 1: numbers = numbers[0] self.__dict__[parts[0]] = numbers # print parts -- debug obj = Data('infile.dat') print obj.g print obj.Delta print obj.Z0 At the end of this, we print out a few of the keys. Here's the output of those. 1.0 20.0 [0.0, 0.0] For consistency, you can remove the line marked "optional" in my code, and have all objects in lists -- regardless of how many elements they have. That will make using them quite a bit easier, because you never have to worry about obj.g[0] returning an error.
Here's another one def splitstrip(s): return s.split(':')[1].strip() with open('input.dat','r') as f: a.z0 = [float(x) for x in splitstrip(f.readline()).split(' ')] a.k, a.g, a.D, a.T = tuple([float(splitstrip(x)) for x in f.read().rstrip().split('\n')]) ;)
Python: File formatting
I have a for loop which references a dictionary and prints out the value associated with the key. Code is below: for i in data: if i in dict: print dict[i], How would i format the output so a new line is created every 60 characters? and with the character count along the side for example: 0001 MRQLLLISDLDNTWVGDQQALEHLQEYLGDRRGNFYLAYATGRSYHSARELQKQVGLMEP 0061 DYWLTAVGSEIYHPEGLDQHWADYLSEHWQRDILQAIADGFEALKPQSPLEQNPWKISYH 0121 LDPQACPTVIDQLTEMLKETGIPVQVIFSSGKDVDLLPQRSNKGNATQYLQQHLAMEPSQ
It's a finicky formatting problem, but I think the following code: import sys class EveryN(object): def __init__(self, n, outs): self.n = n # chars/line self.outs = outs # output stream self.numo = 1 # next tag to write self.tll = 0 # tot chars on this line def write(self, s): while True: if self.tll == 0: # start of line: emit tag self.outs.write('%4.4d ' % self.numo) self.numo += self.n # wite up to N chars/line, no more numw = min(len(s), self.n - self.tll) self.outs.write(s[:numw]) self.tll += numw if self.tll >= self.n: self.tll = 0 self.outs.write('\n') s = s[numw:] if not s: break if __name__ == '__main__': sys.stdout = EveryN(60, sys.stdout) for i, a in enumerate('abcdefgh'): print a*(5+ i*5), shows how to do it -- the output when running for demonstration purposes as the main script (five a's, ten b's, etc, with spaces in-between) is: 0001 aaaaa bbbbbbbbbb ccccccccccccccc dddddddddddddddddddd eeeeee 0061 eeeeeeeeeeeeeeeeeee ffffffffffffffffffffffffffffff ggggggggg 0121 gggggggggggggggggggggggggg hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh 0181 hhhhhhh
# test data data = range(10) the_dict = dict((i, str(i)*200) for i in range( 10 )) # your loops as a generator lines = ( the_dict[i] for i in data if i in the_dict ) def format( line ): def splitter(): k = 0 while True: r = line[k:k+60] # take a 60 char block if r: # if there are any chars left yield "%04d %s" % (k+1, r) # format them else: break k += 60 return '\n'.join(splitter()) # join all the numbered blocks for line in lines: print format(line)
I haven't tested it on actual data, but I believe the code below would do the job. It first builds up the whole string, then outputs it a 60-character line at a time. It uses the three-argument version of range() to count by 60. s = ''.join(dict[i] for i in data if i in dict) for i in range(0, len(s), 60): print '%04d %s' % (i+1, s[i:i+60])
It seems like you're looking for textwrap The textwrap module provides two convenience functions, wrap() and fill(), as well as TextWrapper, the class that does all the work, and a utility function dedent(). If you’re just wrapping or filling one or two text strings, the convenience functions should be good enough; otherwise, you should use an instance of TextWrapper for efficiency.