Doing operations on a large data set

Doing operations on a large data set - python

I have to perform some analysis on a PSL record which contains information on DNA sequence fragments. Basically I have to find entries that are from the same read in the same contig (these are both values in the PSL entry). The problem is the PSL records are large (10-30 Mb text documents). I wrote a program that works on short records and on the long records given enough time but it took way longer than specified. I was told the program shouldn't take more than ~15 seconds. Mine took over 15 minutes.
PSL records look like this:
275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,
171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,
188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,
My code looks like this:
import sys
class PSLreader :
'''
Class to provide reading of a file containing psl alignments
formatted sequences:
object instantiation:
myPSLreader = PSLreader(<file name>):
object attributes:
fname: the initial file name
methods:
readPSL() : reads psl file, yielding those alignments that are within the first or last
1000 nt
readPSLpairs() : yields psl pairs that support a circular hypothesis
Author: David Bernick
Date: May 12, 2013
'''
def __init__ (self, fname=''):
'''contructor: saves attribute fname '''
self.fname = fname
def doOpen (self):
if self.fname is '':
return sys.stdin
else:
return open(self.fname)
def readPSL (self):
'''
using filename given in init, returns each filtered psl records
that contain alignments that are within the terminal 1000nt of
the target. Incomplete psl records are discarded.
If filename was not provided, stdin is used.
This method selects for alignments that could may be part of a
circle.
Illumina pairs aligned to the top strand would have read1(+) and read2(-).
For the bottoms trand, read1(-) and read2(+).
For potential circularity,
these are the conditions that can support circularity:
read1(+) near the 3' terminus
read1(-) near the 5' terminus
read2(-) near the 5' terminus
read2(+) near the 3' terminus
so...
any read(+) near the 3', or
any read(-) near the 5'
'''
nearEnd = 1000 # this constant determines "near the end"
with self.doOpen() as fileH:
for line in fileH:
pslList = line.split()
if len(pslList) < 17:
continue
tSize = int(pslList[14])
tStart = int(pslList[15])
strand = str(pslList[8])
if strand.startswith('+') and (tSize - tStart > nearEnd):
continue
elif strand.startswith('-') and (tStart > nearEnd):
continue
yield line
def readPSLpairs (self):
read1 = []
read2 = []
for psl in self.readPSL():
parsed_psl = psl.split()
strand = parsed_psl[9][-1]
if strand == '1':
read1.append(parsed_psl)
elif strand == '2':
read2.append(parsed_psl)
output = {}
for psl1 in read1:
name1 = psl1[9][:-1]
contig1 = psl1[13]
for psl2 in read2:
name2 = psl2[9][:-1]
contig2 = psl2[13]
if name1 == name2 and contig1 == contig2:
try:
output[contig1] += 1
break
except:
output[contig1] = 1
break
print(output)
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()
I was given some example code that looks like this:
def doSomethingPairwise (a):
for leftItem in a[1]:
for rightItem in a[2]:
if leftItem[1] is rightItem[1]:
print (a)
thisStream = [['David', 'guitar', 1], ['David', 'guitar', 2],
['John', 'violin', 1], ['John', 'oboe', 2],
['Patrick', 'theremin', 1], ['Patrick', 'lute',2] ]
thisGroup = None
thisGroupList = [ [], [], [] ]
for name, instrument, num in thisStream:
if name != thisGroup:
doSomethingPairwise(thisGroupList)
thisGroup = name
thisGroupList = [ [], [], [] ]
thisGroupList[num].append([name, instrument, num])
doSomethingPairwise(thisGroupList)
But when I tried to implement it my program still took a long time. Am I thinking about this the wrong way? I realize the nested loop is slow but I don't see an alternative.
Edit: I figured it out, the data was presorted which made my brute force solution very impractical and unnecessary.

I hope help you, since, the question needs a best input example file
#is better create PSLRecord class
class PSLRecord:
def __init__(self, line):
pslList = line.split()
properties = ("matches", "misMatches", "repMatches", "nCount",
"qNumInsert", "qBaseInsert", "tNumInsert",
"tBaseInsert", "strand", "qName", "qSize", "qStart",
"qEnd", "tName", "tSize", "tStart", "tEnd", "blockCount",
"blockSizes", "qStarts", "tStarts")
self.__dict__.update(dict(zip(properties, pslList)))
class PSLreader :
def __init__ (self, fname=''):
self.fname = fname
def doOpen (self):
if self.fname is '':
return sys.stdin
else:
return open(self.fname)
def readPSL (self):
with self.doOpen() as fileH:
for line in fileH:
pslrc = PSLRecord(line)
yield pslrc
#return a dictionary with all psl records group by qName and tName
def readPSLpairs (self):
dictpsl = {}
for pslrc in self.readPSL():
#OP requirement, remove '1' or '2' char, in pslrc.qName[:-1]
key = (pslrc.qName[:-1], pslrc.tName)
if not key in dictpsl:
dictpsl[key] = []
dictpsl[key].append(pslrc)
return dictpsl
#Function filter .... is better out and self-contained
def f_filter(pslrec, nearEnd = 1000):
if (pslrec.strand.startswith('+') and
(int(pslrec.tSize) - int(pslrec.tStart) > nearEnd)):
return False
if (pslrec.strand.startswith('-') and
(int(pslrec.tStart) > nearEnd)):
return False
return True
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
#read dictionary of pairs
dictpsl = PSL_obj.readPSLpairs()
from itertools import product
#product from itertools
#(1) x (2,3) = (1,2),(1,3)
output = {}
for key, v in dictpsl.items():
name, contig = key
#i get filters aligns in principal strand
strand_princ = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '1']
#i get filters aligns in secondary strand
strand_sec = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '2']
for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
#This For has fewer comparisons, since I was grouped before
if not contig in output:
output[contig] = 1
output[contig] += 1
Note: 10-30 Mb isn't large file, if you ask me

Related

Attempting to Traverse a search API Collecting results in Python 3.9

I am attempting to yield all results from a search api.
There are 3 key cases:
Case 1: There are 0 results yielded
Case 2: There are 1-9 results yielded
Case 3: There are 10 results yielded
In both cases 1 and 2, we can conclude that there are no results deeper. i.e if we searched for "iiiiia" and it yields 0 then that means there are no results for "iiiiia" and no results deeper than "iiiiia". If "iiiiia" yields 1-9 results, then we can concluded "iiiiia" yields 109 results and no results deeper.
In case 3, if we search for "iiiiia" and it yields 10, we can conclude that "iiiiia" has 10 results, and there may or may not be results deeper as 10 means 10+ results possible. In this case we would have to move one layer down the chain and traverse through those results. i.e "iiiiia0"-"iiiiiaz" and if any of those yield 10 we would also have to go one layer deeper.
an example of how this may look:
a
a0
a00
a000
a0000
a0001
...
a000z
a001
a002
a003
a0030
...
a0034
a00340
...
a0034z
a0035...
Here is my attempted code:
import json
from time import sleep
import urllib3
import requests
import re
import random
import sys
import numpy
# Saving the reference of the standard output
original_stdout = sys.stdout
header = {}
link = '/search?query='
http = requests.session()
fileName = '/output.txt'
hasCompleted = 0
number = '11'
def search(target):
global targetList
global link
global header
global http
resp = http.request('GET',link+target,headers=header)
match = re.findall('"key":"(.+?)","',resp.text)
if len(match) == 10:
return False
if len(match) == 0:
return " "
elif len(match) < 10:
return resp.text
def treeSearch():
global fileName
global number
global hasCompleted
if hasCompleted == 1:
new = (int(int(number, 36) / 36) + 1)
new = numpy.base_repr(new, 36)
number = new
hasCompleted = 0
if hasCompleted == 0:
x = 0
new = int(int(number, 36)*36)
new = numpy.base_repr(new, 36)
number = new
while x < 37:
new = int(int(number, 36) + 1)
new = numpy.base_repr(new, 36)
number = new
result = search(number)
print(number)
if result:
with open(fileName, 'a+') as f:
sys.stdout = f
print(result)
sys.stdout = original_stdout
x = x + 1
else:
treeSearch()
temp = number
number = (int(int(number, 36) / 36) + 1) #maybe not + 1
print(number)
number = numpy.base_repr(number, 36)
print(number)
result = search(number)
if not result == " ":
new = int(int(number, 36)*36)
new = numpy.base_repr(new, 36)
number = new
hasCompleted = 1
treeSearch()
Here is my output:
111
1111
11111
111111
1111111
11111111
111111111
1111111111
11111111111
111111111111
1111111111111
11111111111111
111111111111111
1111111111111111
1111111111111112
1111111111111113
1111111111111114
1111111111111115
1111111111111116
1111111111111117
1111111111111118
1111111111111119
111111111111111A
111111111111111B
111111111111111C
111111111111111D
111111111111111E
111111111111111F
111111111111111G
111111111111111H
111111111111111I
111111111111111J
111111111111111K
111111111111111L
111111111111111M
111111111111111N
111111111111111O
111111111111111P
111111111111111Q
111111111111111R
111111111111111S
111111111111111T
111111111111111U
111111111111111V
111111111111111W
111111111111111X
111111111111111Y
111111111111111Z
1111111111111120
1111111111111121
6316397706306666889217
11111111110QR5T
11111111110QR5U
11111111110QR5V
11111111110QR5W
11111111110QR5X
11111111110QR5Y
11111111110QR5Z
11111111110QR60
11111111110QR61
11111111110QR62
11111111110QR63
11111111110QR64
11111111110QR65
11111111110QR66
11111111110QR67
11111111110QR68
11111111110QR69
11111111110QR6A
11111111110QR6B
11111111110QR6C
11111111110QR6D
11111111110QR6E
11111111110QR6F
11111111110QR6G
11111111110QR6H
11111111110QR6I
11111111110QR6J
11111111110QR6K
11111111110QR6L
11111111110QR6M
11111111110QR6N
11111111110QR6O
11111111110QR6P
11111111110QR6Q
11111111110QR6R
11111111110QR6S
11111111110QR6T
11111111110QR6U
175455491841851850753
11111111110L4X
11111111110L4Y
11111111110L4Z
11111111110L50
11111111110L51
11111111110L52
11111111110L53
11111111110L54
11111111110L55
11111111110L56
11111111110L57
11111111110L58
11111111110L59
11111111110L5A
11111111110L5B
11111111110L5C
11111111110L5D
11111111110L5E
11111111110L5F
11111111110L5G
11111111110L5H
11111111110L5I
11111111110L5J
11111111110L5K
11111111110L5L
11111111110L5M
11111111110L5N
11111111110L5O
11111111110L5P
11111111110L5Q
11111111110L5R
11111111110L5S
11111111110L5T
11111111110L5U
11111111110L5V
11111111110L5W
11111111110L5X
11111111110L5Y
4873763662273662977
11111111110XT
11111111110XU
11111111110XV
11111111110XW
11111111110XX
11111111110XY
11111111110XZ
11111111110Y0
11111111110Y1
11111111110Y2
11111111110Y3
11111111110Y4
11111111110Y5
11111111110Y6
11111111110Y7
11111111110Y8
11111111110Y9
11111111110YA
11111111110YB
11111111110YC
11111111110YD
11111111110YE
11111111110YF
11111111110YG
11111111110YH
11111111110YI
11111111110YJ
11111111110YK
11111111110YL
11111111110YM
11111111110YN
11111111110YO
11111111110YP
11111111110YQ
11111111110YR
11111111110YS
11111111110YT
11111111110YU
135382323952046193
11111111110X
11111111110Y
11111111110Z
111111111110
111111111111
111111111121
111111111122
111111111123
111111111124
111111111125
111111111126
111111111127
111111111128
111111111129
11111111112A
11111111112B
11111111112C
11111111112D
11111111112E
11111111112F
11111111112G
11111111112H
11111111112I
11111111112J
11111111112K
11111111112L
11111111112M
11111111112N
11111111112O
11111111112P
11111111112Q
11111111112R
11111111112S
11111111112T
11111111112U
11111111112V
11111111112W
11111111112X
11111111112Y
11111111112Z
111111111130
111111111131
3760620109779064
11111111114
111111111141
111111111142
111111111143
111111111144
111111111145
111111111146
111111111147
111111111148
111111111149
11111111114A
11111111114B
11111111114C
11111111114D
11111111114E
11111111114F
11111111114G
11111111114H
11111111114I
11111111114J
11111111114K
11111111114L
11111111114M
11111111114N
11111111114O
11111111114P
11111111114Q
11111111114R
11111111114S
11111111114T
11111111114U
11111111114V
11111111114W
11111111114X
11111111114Y
3760620109779066
11111111116
11111111117
11111111118
11111111119
1111111111A
1111111111B
1111111111C
1111111111D
1111111111E
1111111111F
1111111111G
1111111111H
1111111111I
1111111111J
1111111111K
1111111111L
1111111111M
1111111111N
1111111111O
1111111111P
1111111111Q
1111111111R
1111111111S
1111111111T
1111111111U
1111111111V
1111111111W
1111111111X
1111111111Y
1111111111Z
11111111120
11111111121
11111111122
11111111123
11111111124
11111111125
11111111126
11111111127
104461669716087
1111111113
1111111114
1111111115
1111111116
1111111117
1111111118
1111111119
111111111A
111111111B
111111111C
111111111D
111111111E
111111111F
111111111G
111111111H
111111111I
111111111J
111111111K
111111111L
111111111M
111111111N
111111111O
111111111P
111111111Q
111111111R
111111111S
111111111T
111111111U
111111111V
111111111W
111111111X
111111111Y
111111111Z
1111111120
1111111121
1111111122
1111111123
1111111124
2901713047671
111111113
111111114
111111115
111111116
111111117
111111118
111111119
11111111A
11111111B
11111111C
11111111D
11111111E
11111111F
11111111G
11111111H
11111111I
11111111J
11111111K
11111111L
11111111M
11111111N
11111111O
11111111P
11111111Q
11111111R
11111111S
11111111T
11111111U
11111111V
11111111W
11111111X
11111111Y
11111111Z
111111120
111111121
111111122
111111123
111111124
80603140215
11111113
11111114
11111115
11111116
11111117
11111118
11111119
1111111A
1111111B
1111111C
1111111D
1111111E
1111111F
1111111G
1111111H
1111111I
1111111J
1111111K
1111111L
1111111M
1111111N
1111111O
1111111P
1111111Q
1111111R
1111111S
1111111T
1111111U
1111111V
1111111W
1111111X
1111111Y
1111111Z
11111120
11111121
11111122
11111123
11111124
2238976119
1111113
11111131
11111132
11111133
11111134
11111135
11111136
11111137
11111138
11111139
1111113A
1111113B
1111113C
1111113D
1111113E
1111113F
1111113G
1111113H
1111113I
1111113J
1111113K
1111113L
1111113M
1111113N
1111113O
1111113P
1111113Q
1111113R
1111113S
1111113T
1111113U
1111113V
1111113W
1111113X
1111113Y
1111113Z
11111140
11111141
2238976121
1111115
11111151
11111152
11111153
11111154
11111155
11111156
11111157
11111158
11111159
1111115A
1111115B
1111115C
1111115D
1111115E
1111115F
1111115G
1111115H
1111115I
1111115J
1111115K
1111115L
1111115M
1111115N
1111115O
1111115P
1111115Q
1111115R
1111115S
1111115T
1111115U
1111115V
1111115W
1111115X
1111115Y
1111115Z
11111160
11111161
2238976123
1111117
11111171
11111172
11111173
11111174
11111175
11111176
11111177
11111178
11111179
1111117A
1111117B
1111117C
1111117D
1111117E
1111117F
1111117G
1111117H
1111117I
1111117J
1111117K
1111117L
1111117M
1111117N
1111117O
1111117P
1111117Q
1111117R
1111117S
1111117T
1111117U
1111117V
1111117W
1111117X
1111117Y
1111117Z
11111180
11111181
2238976125
1111119
111111A
111111B
111111C
111111D
111111E
111111F
111111G
111111H
111111I
111111J
111111K
111111L
111111M
111111N
111111O
111111P
111111Q
111111R
111111S
111111T
111111U
111111V
111111W
111111X
111111Y
111111Z
1111120
1111121
1111122
1111123
1111124
1111125
1111126
1111127
1111128
1111129
111112A
62193783
111113
1111131
1111132
1111133
1111134
1111135
1111136
1111137
1111138
1111139
111113A
111113B
111113C
111113D
111113E
111113F
111113G
111113H
111113I
111113J
111113K
111113L
111113M
111113N
111113O
111113P
111113Q
111113R
111113S
111113T
111113U
111113V
111113W
111113X
111113Y
111113Z
1111140
1111141
62193785
111115
My code traverses in only deeper once then comes out, I will keep working on my code however I am hoping to find an easier solution or possibly a library that can perform this style of search. Thanks!

Python unified diff with line numbers from both "files"

I'm trying to figure out a way to create unified diffs with line numbers only showing N lines of context. I have been unable to do this with difflib.unified_diff. I need to show changes in both files.
The closest I can come is using diff on the command line like so:
/usr/bin/diff
--unchanged-line-format=' %.2dn %L'
--old-line-format="-%.2dn %L"
--new-line-format="+%.2dn %L"
file1.py
file2.py
BUT I only want to show N lines of context, and /usr/bin/diff doesn't seem to support context with a custom line format (eg. -U2 is not compatible with --line-format "conflicting output style options").
Below is an example of what I'd like to accomplish (the same output as the above diff, but only showing 1 line of context surrounding changes):
+01: def renamed_function()
-01: def original_function():
02:
+03: """ Neat stuff here """
04:
21:
+22: # Here's a new comment
23:
85: # Output the value of foo()
+86: print "Foo is %s"%(foo())
-86: print foo()
87:

I was able to figure out something very close to what I wanted to do. It's slower than regular diff, though. Here's the entire code, from my project GitGate.
def unified_diff(to_file_path, from_file_path, context=1):
""" Returns a list of differences between two files based
on some context. This is probably over-complicated. """
pat_diff = re.compile(r'## (.[0-9]+\,[0-9]+) (.[0-9]+,[0-9]+) ##')
from_lines = []
if os.path.exists(from_file_path):
from_fh = open(from_file_path,'r')
from_lines = from_fh.readlines()
from_fh.close()
to_lines = []
if os.path.exists(to_file_path):
to_fh = open(to_file_path,'r')
to_lines = to_fh.readlines()
to_fh.close()
diff_lines = []
lines = difflib.unified_diff(to_lines, from_lines, n=context)
for line in lines:
if line.startswith('--') or line.startswith('++'):
continue
m = pat_diff.match(line)
if m:
left = m.group(1)
right = m.group(2)
lstart = left.split(',')[0][1:]
rstart = right.split(',')[0][1:]
diff_lines.append("## %s %s ##\n"%(left, right))
to_lnum = int(lstart)
from_lnum = int(rstart)
continue
code = line[0]
lnum = from_lnum
if code == '-':
lnum = to_lnum
diff_lines.append("%s%.4d: %s"%(code, lnum, line[1:]))
if code == '-':
to_lnum += 1
elif code == '+':
from_lnum += 1
else:
to_lnum += 1
from_lnum += 1
return diff_lines

Parse svg:path d attribute

I need to decipher a path element in an SVG document to drive a CNC machine along that path. I wonder if there are any Python libraries that parse SVG and give some sort of pythonic list for the d attribute, e.g.:
<path d="M 20 30 L 20 20 20 40 40 40"/>
parses into
[["M", 20, 30],
["L", 20, 20],
["L", 20, 40],
["L", 40, 40]]

Here's a start it's written by me and in python 2.7.2. Just delete the tests and print statements if you want to.
Copyright 2012 Christopher L. Ramsey
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
from collections import OrderedDict
from re import match
from re import split
from re import sub
class PathIterator(object):
EOI = 'End of Iteration'
PATH_IDENTIFIERS = r'[MLHVCSQTAmlhvcsqa]'
NUMBERS = r'[0-9.-^A-z]'
SEPERATORS = r'\s|\,'
PATH_END = r'[Zz]'
def __init__(self, path):
self.parseable = path.translate(None, '\t\f')
self.parseable = self.parseable.replace('\n', ' ')
print 'strip_newlines: {}'.format(self.parseable)
self.parseable = sub(r'([A-Za-z])([0-9]|\-)', self.insert, self.parseable)
print 'add_space: {}'.format(self.parseable)
self.parseable = self.parseable.replace(',', ' ')
print 'replace_commas: {}'.format(self.parseable)
self.parseable = sub(r'\s+', ' ', self.parseable) # replace any double space with a single space
print 'strip_extra_space: {}'.format(self.parseable)
self.tokens = split(' ', self.parseable)
self.map = self.produce_map(self.tokens)
print self.map
self.elements = self.process(self.map)
def produce_map(self, tkns):
self.m = OrderedDict()
self.i = 0
while self.i < len(tkns):
if match(self.PATH_IDENTIFIERS, tkns[self.i]):
self.m[self.i] = tkns[self.i]
elif match(self.PATH_END, tkns[self.i]):
self.m[self.i] = tkns[self.i]
else:
pass
self.i += 1
return self.m.items()
def process(self, map):
self.mm = []
self.l = len(map)
for e in range(self.l):
try:
self.element = map[e]
self.future = map[e + 1]
self.ident = self.element[1]
self.start = self.element[0] + 1
self.end = self.future[0]
self.nbrs = self.tokens[self.start:self.end]
except:
self.element = map[e]
self.ident = self.element[1]
self.start = self.element[0] + 1
self.end = len(self.tokens)
self.nbrs = self.tokens[self.start:self.end]
print 'start: {} end {}'.format(self.start, self.end)
finally:
self.numbers = []
for number in self.nbrs:
self.numbers.append(float(number))
self.mm.append((self.ident, self.numbers))
return iter(self.mm)
def next(self):
try:
return self.elements.next()
except:
return self.EOI
def insert(self, match_obj):
self.group = match_obj.group()
return '{} {}'.format(self.group[0], self.group[1])
if __name__ == '__main__':
inkscape_path = "M 12,90 C 8.676,90 6,87.324 6,84 L 6,82 6,14 6,12 c 0,-0.334721 0.04135,-0.6507 0.09375,-0.96875 0.0487,-0.295596 0.09704,-0.596915 0.1875,-0.875 C 6.29113,10.12587 6.302142,10.09265 6.3125,10.0625 6.411365,9.774729 6.5473802,9.515048 6.6875,9.25 6.8320918,8.976493 7.0031161,8.714385 7.1875,8.46875 7.3718839,8.223115 7.5612765,7.995278 7.78125,7.78125 8.221197,7.353194 8.72416,6.966724 9.28125,6.6875 9.559795,6.547888 9.8547231,6.440553 10.15625,6.34375 9.9000482,6.443972 9.6695391,6.580022 9.4375,6.71875 c -0.00741,0.0044 -0.023866,-0.0045 -0.03125,0 -0.031933,0.0193 -0.062293,0.04251 -0.09375,0.0625 -0.120395,0.0767 -0.2310226,0.163513 -0.34375,0.25 -0.1061728,0.0808 -0.2132809,0.161112 -0.3125,0.25 C 8.4783201,7.442683 8.3087904,7.626638 8.15625,7.8125 8.0486711,7.942755 7.9378561,8.077785 7.84375,8.21875 7.818661,8.25713 7.805304,8.30462 7.78125,8.34375 7.716487,8.446782 7.6510225,8.548267 7.59375,8.65625 7.4927417,8.850956 7.3880752,9.071951 7.3125,9.28125 7.30454,9.30306 7.288911,9.3218 7.28125,9.34375 7.2494249,9.4357 7.2454455,9.530581 7.21875,9.625 7.1884177,9.731618 7.1483606,9.828031 7.125,9.9375 7.0521214,10.279012 7,10.635705 7,11 l 0,2 0,68 0,2 c 0,2.781848 2.2181517,5 5,5 l 2,0 68,0 2,0 c 2.781848,0 5,-2.218152 5,-5 l 0,-2 0,-68 0,-2 C 89,10.635705 88.94788,10.279012 88.875,9.9375 88.83085,9.730607 88.78662,9.539842 88.71875,9.34375 88.71105,9.3218 88.69545,9.30306 88.6875,9.28125 88.62476,9.107511 88.549117,8.913801 88.46875,8.75 88.42717,8.6672 88.38971,8.580046 88.34375,8.5 88.28915,8.40279 88.216976,8.31165 88.15625,8.21875 88.06214,8.077785 87.951329,7.942755 87.84375,7.8125 87.700576,7.63805 87.540609,7.465502 87.375,7.3125 87.36383,7.3023 87.35502,7.29135 87.34375,7.28125 87.205364,7.155694 87.058659,7.046814 86.90625,6.9375 86.803679,6.86435 86.701932,6.784136 86.59375,6.71875 c -0.0074,-0.0045 -0.02384,0.0044 -0.03125,0 -0.232039,-0.138728 -0.462548,-0.274778 -0.71875,-0.375 0.301527,0.0968 0.596455,0.204138 0.875,0.34375 0.55709,0.279224 1.060053,0.665694 1.5,1.09375 0.219973,0.214028 0.409366,0.441865 0.59375,0.6875 0.184384,0.245635 0.355408,0.507743 0.5,0.78125 0.14012,0.265048 0.276135,0.524729 0.375,0.8125 0.01041,0.03078 0.02133,0.06274 0.03125,0.09375 0.09046,0.278085 0.1388,0.579404 0.1875,0.875 C 89.95865,11.3493 90,11.665279 90,12 l 0,2 0,68 0,2 c 0,3.324 -2.676,6 -6,6 l -72,0 z"
mdn_path = "M10 80 Q 52.5 10, 95 80 T 180 80"
w3c_path = "M100,200 C100,100 250,100 250,200 S400,300 400,200"
w3c_path_neg = "M-100,200 C100,100 250,100 250,200 S-400,300 400,200"
w3c_path_nl = '''
M600,350 l 50,-25
a25,25 -30 0,1 50,-25 l 50,-25
a25,50 -30 0,1 50,-25 l 50,-25
a25,75 -30 0,1 50,-25 l 50,-25
a25,100 -30 0,1 50,-25 l 50,-25
'''
paths = [inkscape_path, mdn_path, w3c_path, str.strip(w3c_path_nl), w3c_path_neg]
for path in paths:
p = PathIterator(path)
char = ''
while char != PathIterator.EOI:
char = p.next()
print char

Getting the d-string can be down in a couple lines using svgpathtools, the rest can be done using regular expressions.
from svgpathtools import svg2paths
paths, attributes = svg2paths('some_svg_file.svg')
paths is a list of svgpathtools Path objects (containing just the curve info, no colors, styles, etc.).
attributes is a list of dictionary objects of the attributes.
Suppose the path you are interested in is the first (the 0th) listed in your SVG, then to extract just the d-string you can use:
d = attributes[0]['d'] # d-string from first path in SVG
# Now for some regular expressions magic
import re
split_by_letters = re.findall('[A-Z|a-z][^A-Z|a-z]*', d)
split_as_you_want = []
for x in split_by_letters:
nums = x[1:].replace(',',' ').split() # list of numbers after letter
for k in range(len(nums) // 2):
split_as_you_want.append([x[0]] + [nums[k]] + [nums[k+1]])
print split_as_you_want
I didn't convert the numbers into strings here as how you want to do that depends on whether they're always integers and whether you care they stay that way. For most purposes this can be done with something like the following right below the "nums = ..." line.
for k, n in enumerate(nums):
try:
nums[k] = int(n)
except ValueError:
nums[k] = float(n)

load parameters from a file in Python

I am writing a Python class to model a process and I want to initialized the parameters from a file, say 'input.dat'. The format of the input file looks like this.
'input.dat' file:
Z0: 0 0
k: 0.1
g: 1
Delta: 20
t_end: 300
The code I wrote is the following. It works but appears redundant and inflexible. Is there a better way to do the job? Such as a loop to do readline() and then match the keyword?
def load(self,filename="input.dat"):
FILE = open(filename)
s = FILE.readline().split()
if len(s) is 3:
self.z0 = [float(s[1]),float(s[2])] # initial state
s = FILE.readline().split()
if len(s) is 2:
self.k = float(s[1]) # kappa
s = FILE.readline().split()
if len(s) is 2:
self.g = float(s[1])
s = FILE.readline().split()
if len(s) is 2:
self.D = float(s[1]) # Delta
s = FILE.readline().split()
if len(s) is 2:
self.T = float(s[1]) # end time

Assuming the params are coming from a safe place (made by you or users, not the internet), just make the parameters file a Python file, params.py:
Z0 = (0, 0)
k = 0.1
g = 1
Delta = 20
t_end = 300
Then in your code all you need is:
import params
fancy_calculation(10, k=params.k, delta=params.Delta)
The beauty of this is two-fold: 1) simplicity, and 2) you can use the power of Python in your parameter descriptions -- particularly useful here, for example:
k = 0.1
Delta = 20
g = 3 * k + Delta
Alternatively, you could use Python's built-in JSON or ConfigParser .INI parser modules.

If you are open to some other kind of file where you can keep your parameters, I would suggest you to use a YAML file.
The Python library is PyYAML. This is how you can easily use it with Python.
For a better introduction, look at this Wikipedia article: http://en.wikipedia.org/wiki/YAML.
The benefit is you can read the parameter values as lists or maps.
You would love it!

Try the following:
def load(self, filename="input.dat"):
d = {"Z0": "z0", "k": "k", "g": "g", "Delta": "D", "t_end": "T"}
FILE = open(filename)
for line in FILE:
name, value = line.split(":")
value = value.strip()
if " " in value:
value = map(float, value.split())
else:
value = float(value)
setattr(self, d[name], value)
Proof that it works:
>>> class A(object): pass
...
>>> a = A()
>>> load(a)
>>> a.__dict__
{'k': 0.10000000000000001, 'z0': [0.0, 0.0], 'D': 20.0, 'g': 1.0, 'T': 300.0}

As others have mentioned, in Python you can create object attributes dynamically "on the fly". That means you could do something like the following to create Params objects as they're read-in. I've tried to make the code as data-driven as possible, so relatively flexible.
# maps label to attribute name and types
label_attr_map = {
"Z0:": ["z0", float, float],
"k:": [ "k", float],
"g:": [ "g", float],
"Delta:": [ "D", float],
"t_end:": [ "T", float]
}
class Params(object):
def __init__(self, input_file_name):
with open(input_file_name, 'r') as input_file:
for line in input_file:
row = line.split()
label = row[0]
data = row[1:] # rest of row is data list
attr = label_attr_map[label][0]
datatypes = label_attr_map[label][1:]
values = [(datatypes[i](data[i])) for i in range(len(data))]
self.__dict__[attr] = values if len(values) > 1 else values[0]
params = Params('input.dat')
print 'params.z0:', params.z0
print 'params.k:', params.k
print 'params.g:', params.g
print 'params.D:', params.D
print 'params.T:', params.T
Output:
params.z0: [0.0, 0.0]
params.k: 0.1
params.g: 1.0
params.D: 20.0
params.T: 300.0

Perhaps this might give you what you need:
def load(self,filename='input.dat'):
with open(filename) as fh:
for line in fh:
s = line.split()
if len(s) == 2:
setattr(self,s[1],s[2])
elif len(s) == 3:
setattr(self,s[1],s[2:])
I also didn't include any error checking, but setattr is very handy.

Something like this:
def load(self,filename="input.dat"):
# maps names to number of fields they need
# only necessary for variables with more than 1 field
argmap = dict(Z0=2)
# maps config file names to their attribute names on the object
# if name is the same both places, no need
namemap = dict(Z0="z0", Delta="D", t_end="T")
with open(filename) as FILE:
for line in FILE:
s = line.split()
var = s[0].rstrip(":")
try:
val = [float(x) for x in s[1:]]
except ValueError:
continue
if len(val) == varmap.get(var, 1):
if len(val) == 1:
val = val[0]
setattr(self, namemap.get(var, var), val)

Python objects have a built-in __dict__ member. You can modify it, and then refer to properties as obj.key.
class Data(object):
def __init__(self, path='infile.dat'):
with open(path, 'r') as fo:
for line in fo.readlines():
if len(line) < 2: continue
parts = [s.strip(' :\n') for s in line.split(' ', 1)]
numbers = [float(s) for s in parts[1].split()]
# This is optional... do you want single values to be stored in lists?
if len(numbers) == 1: numbers = numbers[0]
self.__dict__[parts[0]] = numbers
# print parts -- debug
obj = Data('infile.dat')
print obj.g
print obj.Delta
print obj.Z0
At the end of this, we print out a few of the keys. Here's the output of those.
1.0
20.0
[0.0, 0.0]
For consistency, you can remove the line marked "optional" in my code, and have all objects in lists -- regardless of how many elements they have. That will make using them quite a bit easier, because you never have to worry about obj.g[0] returning an error.

Here's another one
def splitstrip(s):
return s.split(':')[1].strip()
with open('input.dat','r') as f:
a.z0 = [float(x) for x in splitstrip(f.readline()).split(' ')]
a.k, a.g, a.D, a.T = tuple([float(splitstrip(x)) for x in f.read().rstrip().split('\n')])
;)

Python: File formatting

I have a for loop which references a dictionary and prints out the value associated with the key. Code is below:
for i in data:
if i in dict:
print dict[i],
How would i format the output so a new line is created every 60 characters? and with the character count along the side for example:
0001
MRQLLLISDLDNTWVGDQQALEHLQEYLGDRRGNFYLAYATGRSYHSARELQKQVGLMEP
0061
DYWLTAVGSEIYHPEGLDQHWADYLSEHWQRDILQAIADGFEALKPQSPLEQNPWKISYH
0121 LDPQACPTVIDQLTEMLKETGIPVQVIFSSGKDVDLLPQRSNKGNATQYLQQHLAMEPSQ

It's a finicky formatting problem, but I think the following code:
import sys
class EveryN(object):
def __init__(self, n, outs):
self.n = n # chars/line
self.outs = outs # output stream
self.numo = 1 # next tag to write
self.tll = 0 # tot chars on this line
def write(self, s):
while True:
if self.tll == 0: # start of line: emit tag
self.outs.write('%4.4d ' % self.numo)
self.numo += self.n
# wite up to N chars/line, no more
numw = min(len(s), self.n - self.tll)
self.outs.write(s[:numw])
self.tll += numw
if self.tll >= self.n:
self.tll = 0
self.outs.write('\n')
s = s[numw:]
if not s: break
if __name__ == '__main__':
sys.stdout = EveryN(60, sys.stdout)
for i, a in enumerate('abcdefgh'):
print a*(5+ i*5),
shows how to do it -- the output when running for demonstration purposes as the main script (five a's, ten b's, etc, with spaces in-between) is:
0001 aaaaa bbbbbbbbbb ccccccccccccccc dddddddddddddddddddd eeeeee
0061 eeeeeeeeeeeeeeeeeee ffffffffffffffffffffffffffffff ggggggggg
0121 gggggggggggggggggggggggggg hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
0181 hhhhhhh

# test data
data = range(10)
the_dict = dict((i, str(i)*200) for i in range( 10 ))
# your loops as a generator
lines = ( the_dict[i] for i in data if i in the_dict )
def format( line ):
def splitter():
k = 0
while True:
r = line[k:k+60] # take a 60 char block
if r: # if there are any chars left
yield "%04d %s" % (k+1, r) # format them
else:
break
k += 60
return '\n'.join(splitter()) # join all the numbered blocks
for line in lines:
print format(line)

I haven't tested it on actual data, but I believe the code below would do the job. It first builds up the whole string, then outputs it a 60-character line at a time. It uses the three-argument version of range() to count by 60.
s = ''.join(dict[i] for i in data if i in dict)
for i in range(0, len(s), 60):
print '%04d %s' % (i+1, s[i:i+60])

It seems like you're looking for textwrap
The textwrap module provides two convenience functions, wrap() and
fill(), as well as TextWrapper, the class that does all the work, and
a utility function dedent(). If you’re just wrapping or filling one or
two text strings, the convenience functions should be good enough;
otherwise, you should use an instance of TextWrapper for efficiency.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Doing operations on a large data set - python

Related

Attempting to Traverse a search API Collecting results in Python 3.9

Python unified diff with line numbers from both "files"

Parse svg:path d attribute

load parameters from a file in Python

Python: File formatting

Categories

Resources