def function(file_count, results):
    """Build one HTML table row per file, listing every result name.

    Args:
        file_count: Number of rows to emit; rows are labelled Ex1..ExN.
        results: Iterable of objects exposing a ``name`` attribute.

    Returns:
        All ``<tr>`` rows concatenated into one string ("" when
        ``file_count`` is 0).
    """
    # The names come from the ``results`` argument; the previous loop read an
    # undefined ``engine_results`` and raised NameError.
    args = [str(r.name) for r in results]
    # Every row lists the same names, so the cells are built once.  Joining
    # ``[args[x] in args]`` (a list holding a single bool) raised TypeError.
    cells = "</td> <td>".join(args)
    rows = []
    for x in range(file_count):
        rows.append("".join([
            "<tr><td>Ex",
            str(x + 1),
            "</td> <td>",
            cells,
            "</td> <td></tr>",
        ]))
    return "".join(rows)
I want to print out for example
"<tr><td>"Ex1"</td><td></td><td>" A B C D E "</td> <td></tr>"
A B C D E being the content of args. This method didn't work, nor did a previous one I tested (column[x] in args), which just gave me the first letter.
I think I understand what you are asking. This is what I did.:
def function(file_count, results):
    """Return one quoted table-row line per file, joined by newlines.

    Args:
        file_count: Number of lines to produce; they are labelled Ex1..ExN.
        results: Iterable of result objects (their ``name`` attribute is
            used) or of plain values (converted with ``str``).

    Returns:
        The lines joined with "\\n"; an empty string when ``file_count`` is 0.
    """
    # Prefer the item's ``name`` so result objects print as their name rather
    # than their repr; plain strings/numbers fall back to themselves.
    arg_string = ' '.join(str(getattr(item, 'name', item)) for item in results)
    # Set up templates (the literal double quotes are part of the output)
    template_1 = '"<tr><td>"'
    template_2 = '"</td><td></td><td>"'
    template_3 = '"</td> <td></tr>"'
    lines = []
    # Labels are 1-based (Ex1, Ex2, ...), matching the requested output.
    for number in range(1, file_count + 1):
        ex = 'Ex{}'.format(number)
        lines.append(''.join([template_1, ex, template_2, arg_string, template_3]))
    return '\n'.join(lines)
Related
Essentially, I am creating a count using a dictionary: every time the program sees a "1" in the text file, it adds one to the array. However, I keep getting an error.
# Column order of the input rows: column 0 is counted under "A", column 1
# under "B", and so on.  (The list previously had a missing quote, which is
# a SyntaxError, and an "F" entry with no counter, which is a KeyError.)
Letters = ["A", "B", "C", "D", "E"]
# One single-element list per letter, holding that letter's running count.
d = {letter: [0] for letter in Letters}
# ``with`` closes the file even if a row cannot be processed.
with open('test1.txt', 'r') as file:
    for line in file:
        line_array = line.strip("\n").split(",")
        # Pair each column with its own letter so only that letter's counter
        # is incremented; zip stops at the shorter sequence, so a short row
        # cannot raise IndexError.
        for letter, field in zip(Letters, line_array):
            if field == "1":
                d[letter][0] += 1
BTW, the text file is formatted like this;
1,0,3,0,2
0,2,1,0,3
ETC
EDIT sorry, misworded
You never actually use your dictionary.
# Column order of the input rows: column 0 is counted under "A", column 1
# under "B", and so on.
Letters = ["A", "B", "C", "D", "E"]
# Number of "1" fields seen so far in each letter's column.
d = {key: 0 for key in Letters}
print(Letters)
# ``with`` closes the file even if a row cannot be processed.
with open('test1.txt', 'r') as file:
    for line in file:
        line_array = line.strip("\n").split(",")
        # Pair columns with letters directly instead of scanning Letters for
        # the matching index; zip also tolerates rows with fewer than five
        # fields, where indexing by range(5) raised IndexError.
        for key, field in zip(Letters, line_array):
            if field == "1":
                d[key] += 1
I have a txt file with the following tuple format
ABC-01 name1,10
DEF-02 name2,11
GHI-03 name3,12
JKH-04 name4,13
I may not be able to use import re. Need to do without re.
I need to split the tuples at the delimiters(ABC-01 and others are one word and I need to keep the hyphen). My output needs to be as follows
Format of the needed result
Out[]:
[(u'name1', u'ABC-01 10'),
(u'name2', u'DEF-02 11'),
(u'name3', u'GHI-03 12 '),
(u'name4', u'JKL-04 13')]
Here's what I have tried till now and the output I get
Solution 1:
# Solution 1 (Python 2 print statement): fails before anything is printed.
def split_func(line):
line_mod = line.split(' ')
# str.split returns a list and lists have no .split method, so the next
# statement raises AttributeError.
line_mod1 = line_mod.split(',')
print line_mod1
Result
Attribute Error : list object has no attribute split
Solution 2:
# Solution 2: runs, but returns mangled strings.
def split_func(line):
line_mod = line.split(' ')
# str(line_mod) is the printable form of the whole list (brackets, quotes
# and u'' prefixes included), so splitting it on ',' carries that
# punctuation into a, b and c.
a,b,c = str(line_mod).split(',')
return (b,a + " " + c)
Result
[(" u'name1", "[u'ABC-01' 10]"),
(" u'name2", "[u'DEF-02' 11]"),
(" u'name3", "[u'GHI-03' 12]"),
(" u'name4", "[u'JKL-04' 13]")]
How can I get the exact format that I am trying to get?
Here is a re example below.
import re
def main():
    """Parse every non-blank line of test.txt and print the result list."""
    with open("test.txt") as f:
        # Blank lines (e.g. a trailing empty line) carry no record and would
        # make split_func raise ValueError, so they are skipped.
        result = [split_func(line.strip()) for line in f if line.strip()]
    print(result)


def split_func(line):
    """Turn ``'CODE name,number'`` into ``(name, 'CODE number')``.

    Args:
        line: One record, without surrounding whitespace.

    Returns:
        A 2-tuple ``(name, code + " " + number)``.

    Raises:
        ValueError: If the line does not split into exactly three fields.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    # A single character class treats any run of whitespace and/or commas as
    # one delimiter; "\s|,+" produced empty fields for repeated whitespace.
    a, b, c = re.split(r"[\s,]+", line)
    return b, a + " " + c


if __name__ == '__main__':
    main()
OR
Here is one without re
def main():
    """Parse every non-blank line of test.txt and print the result list."""
    with open("test.txt") as f:
        # Blank lines carry no record and would make split_func raise
        # ValueError, so they are skipped.
        result = [split_func(line.strip()) for line in f if line.strip()]
    print(result)


def split_func(line):
    """Turn ``'CODE name,number'`` into ``(name, 'CODE number')``.

    Args:
        line: One record, without surrounding whitespace.

    Returns:
        A 2-tuple ``(name, code + " " + number)``.

    Raises:
        ValueError: If the code/rest separator or the single comma is
            missing.
    """
    # Split once on any whitespace run: a tab or doubled space between the
    # code and the rest no longer breaks the two-way unpacking.
    a, b = line.split(None, 1)
    b, c = b.split(',')
    return b, a + " " + c


if __name__ == '__main__':
    main()
With the output looking like this
[('name1', 'ABC-01 10'), ('name2', 'DEF-02 11'), ('name3', 'GHI-03 12'), ('name4', 'JKH-04 13')]
You can do something like
def split_func(line):
    """Turn ``'CODE name,number'`` into ``(name, 'CODE number')``.

    Text after the comma is kept as-is, so a trailing space in the input
    survives in the second element.

    Raises:
        ValueError: If the whitespace separator or the single comma is
            missing.
    """
    # Split only once: with split(' ') a trailing space produced a third
    # piece and the two-way unpacking raised ValueError.
    a, b = line.split(None, 1)
    c, d = b.split(',')
    return c, ' '.join([a, d])
Your Solution 1 does not work because split() returns a list and you can't use split() on a list.
For Solution2
x = ['ab', 'cd']
str(x) gives "['ab', 'cd']"
What you need is join() function.
I am trying to alter an existing ASCII data file in a specific way.
The way I would like to go is to find any one string from an array, which I define beforehand.
If this string is found in the file I would like to change the preceding entry; the string to put in here depends on which of the strings is found in the first place.
I have a file where the entries are separated by spaces, and I have trailing spaces at the end to fill up 30 columns. The respective strings would not be in the first line and there would never be more than one per line. An example could look like this:
test01out.txt:
a0997 b0998 c0999
a1000 b1001 c1002
a1003 b1004 c1005
a1006 a1000 c1007
a1008 b1009 c1010
b1001 b1011 c1012
a1013 b1014 b1001
a1015 b1016 c1017
The file does not necessarily have to have three columns in a row. It is possible, that a row has only two but can also have four or five columns.
My current attempt was the following:
from numpy import *
# First attempt: walk test01.txt line by line and, when a line contains one
# of the strings in searcharray, overwrite the entry before it with the
# string at the same position in alterarray, writing to test01out.txt.
# NOTE(review): this version does not parse as written (see notes below).
findlines = open("test01.txt").read().split("\n")
searcharray = array(["a1000","b1001"])
alterarray = array(["this1","this2"])
tempstring_current = ""
fileout = open("test01out.txt", "w")
for i, line in enumerate(findlines):
tempstring_last = tempstring_current
# NOTE(review): one closing parenthesis too many on the next line.
tempstring_current = line.rstrip().split(" "))
if any(x in tempstring_current for x in searcharray): # check if one of the elements is in the current line -> unfortunately this seems to be true for any line checked...
print(i)
print(tempstring_current)
for j, element in enumerate(tempstring_current):
if any(searcharray == tempstring_current):
currentsearchindex = argmax(searcharray == tempstring_current)
currentalterstring = alterarray[currentsearchindex]
if currentsearchindex == 0:
# NOTE(review): the assignments below store into the temporary list that
# .split(" ") returns, not into tempstring_last / tempstring_current.
tempstring_last.split(" ")[-1] = currentalterstring
else:
tempstring_current.split(" ")[currentsearchindex - 1] = currentalterstring
# NOTE(review): currentdesignatedspeed is not defined anywhere in this snippet.
tempstring_current.split(" ")[currentsearchindex-1] = "XPRZeugs_towrite" + repr(currentdesignatedspeed)
tempstring_last = tempstring_last.ljust(30)
# NOTE(review): neither try below has an except or finally clause.
try:
fileout.write(str(tempstring_last))
fileout.write("\r")
try:
fileout.close()
searcharray and alterarray would have some more elements, than two.
I have tested the script up to the any condition; unfortunately, the any condition always seems to be met, for some reason I do not quite understand:
from numpy import *
# Reduced test: print the index and tokens of every line for which the any()
# condition holds.
findlines = open("test01.txt").read().split("\n")
searcharray = array(["a1000","b1001"])
alterarray = array(["this1","this2"])
tempstring_current = ""
fileout = open("test01out.txt", "w")
for i, line in enumerate(findlines):
tempstring_last = tempstring_current
tempstring_current = line.rstrip().split(" ")
# NOTE(review): after `from numpy import *`, `any` presumably resolves to
# numpy's any rather than the built-in, and the argument here is a generator
# expression, not a list - confirm this is why the test is always true.
if any(x in tempstring_current for x in searcharray): # check if one of the elements is in the current line -> unfortunately this seems to be true for any line checked...
print(i)
print(tempstring_current)
Edit/Solution:
I realized I made a mistake in the input testfile:
It should look like this:
a0997 b0998 c0999
a1000 b1001 c1001
a1003 b1004 c1005
a1006 a1000 c1007
a1008 b1009 c1010
c1002 b1011 c1012
a1013 b1014 c1002
a1015 b1016 c1017
The full code doing the job is the following:
from numpy import *
# Final version: copy test01.txt to test01out.txt and, whenever a line
# contains one of the strings in searcharray, overwrite the entry before it
# with the string at the same position in alterarray.
# NOTE(review): the original nesting cannot be recovered from this flattened
# text, so the code is left untouched and only annotated.
findlines = open("test01.txt").read().split("\n")
searcharray = array(["a1000","c1002"])
alterarray = array(["this1","this2"])
tempstring_current = ""
fileout = open("test01out.txt", "w")
for i, line in enumerate(findlines):
# tempstring_last refers to the previous line's token list (the same list
# object), so changing it below changes what is written for that line.
tempstring_last = tempstring_current
tempstring_current = line.rstrip().split(" ")
if any([x in tempstring_current for x in searcharray]): # check if one of the elements is in the current line -> unfortunately this seems to be true for any line checked...
# print(i)
# print(tempstring_current)
# print(searcharray)
# print([x in tempstring_current for x in searcharray])
# print(argmax([x in tempstring_current for x in searcharray]))
currentsearchposindex = argmax([x in tempstring_current for x in searcharray]) # which index does the matching element have in searcharray?
currentalterstring = alterarray[currentsearchposindex] # what is the corresponding entry in alterarray
# Find the position of the matched string within the current line's tokens.
for j, currentXPRelement in enumerate(tempstring_current):
if currentXPRelement == searcharray[currentsearchposindex]:
currentsearchindex_intemparray = j
# print(len(tempstring_current))
# print(searcharray[currentsearchposindex])
# print(tempstring_current == searcharray[currentsearchposindex])
# print(searcharray[currentsearchposindex] == tempstring_current)
# print(argmax(tempstring_current == searcharray[currentsearchposindex]))
# currentsearchindex_intemparray = argmax(tempstring_current == searcharray[currentsearchposindex])
# A match in the first column replaces the last token of the previous line;
# any other match replaces the token just before it on the same line.
if currentsearchindex_intemparray == 0:
tempstring_last[-1] = currentalterstring
else:
tempstring_current[currentsearchindex_intemparray - 1] = currentalterstring
# tempstring_current[currentsearchindex_intemparray-1] = "XPRZeugs_towrite" + repr(currentalterstring)
# Re-join the previous line and pad it with spaces to 30 characters.
tempstring_last = str(" ".join(tempstring_last)).ljust(30)
# Nothing precedes the first line, so there is nothing to write for i == 0.
if not i == 0:
try:
fileout.write(str(tempstring_last))
# Lines are terminated with "\r" here, not "\n".
fileout.write("\r")
finally:
# The bare None is a no-op; this finally clause performs no cleanup.
None
# Write the current line without padding, then close the output file.
try:
fileout.write(" ".join(tempstring_current))
fileout.write("\r")
fileout.close()
finally:
None
To fix your code so at least it can fail to always match, change
if any(x in tempstring_current for x in searcharray):
to
if any([x in tempstring_current for x in searcharray]):
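I think the reason is the `from numpy import *` at the top of the script: it replaces the built-in any() with numpy.any(). numpy.any() does not iterate over a generator expression such as 'x in tempstring_current for x in searcharray'; it treats the generator object itself as a single value, and because that object is truthy the result is always True. The changed syntax creates a list from the generator and then any works as you probably wanted, i.e. it returns true if any element in the list is true. (The built-in any() handles generator expressions correctly, so importing numpy as `import numpy as np` instead of using the star import avoids the problem as well.)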
I am trying to get the range of numbers between two dotted numbers, like 2.1.0 and 2.1.3.
My requirement is that the first two numbers need to be the same (so not 2.1.0 to 2.2.0)
What I want to get out is:
['2.1.0', '2.1.1', '2.1.2', '2.1.3']
Here is what I have tried, and it works, but I want to know if there is a better way to do it.
start = "2.1.0"
end = "2.1.3"
def get_dotted_range(start, end):
    """Return every dotted version from ``start`` to ``end`` inclusive.

    Only the last component may differ between the two versions.

    Args:
        start: First version, e.g. "2.1.0".
        end: Last version, e.g. "2.1.3".

    Returns:
        A list of version strings, or False when the versions have a
        different number of components, differ in any component but the
        last, or ``end`` is lower than ``start``.

    Raises:
        ValueError: If a last component is not an integer.
    """
    start_parts = start.split(".")
    end_parts = end.split(".")
    # Same number of sections, and all but the last one identical.
    if (len(start_parts) != len(end_parts)
            or start_parts[:-1] != end_parts[:-1]):
        return False
    first = int(start_parts[-1])
    last = int(end_parts[-1])
    if last < first:
        return False  # end is lower than start
    # Reuse the whole shared prefix rather than formatting exactly three
    # components, so "1.0" or "1.2.3.4" style versions also come out right.
    prefix = start_parts[:-1]
    return [".".join(prefix + [str(i)]) for i in range(first, last + 1)]
# Fixed inputs for the demonstration.
start = "2.1.0"
end = "2.1.3"
# Convert every component to int; each version must have exactly three.
startFirst, startMiddle, startLast = [int(piece) for piece in start.split(".")]
_, _, endLast = [int(piece) for piece in end.split(".")]
# One "first.middle.last" string per value of the last component, with the
# end value included.
dottedRange = [
    "{}.{}.{}".format(startFirst, startMiddle, patch)
    for patch in range(startLast, endLast + 1)
]
start = "2.1.0"
end = "2.1.3"
def get_dotted_range(start, end):
    """Return every dotted version from ``start`` to ``end`` inclusive.

    Args:
        start: First version, e.g. "2.1.0".
        end: Last version, e.g. "2.1.3".

    Returns:
        A list of version strings; an empty list when the parts before the
        last number differ or ``end`` is lower than ``start``.

    Raises:
        ValueError: If a last component is not an integer.
    """
    # Split off the last number of each version.
    start_head, _, start_last = start.rpartition(".")
    end_head, _, end_last = end.rpartition(".")
    x = int(start_last)
    y = int(end_last)
    # Everything before the last number must be identical.
    if start_head != end_head:
        return []
    # A version with a single component has no head; adding the "."
    # unconditionally produced strings like ".5".
    head = start_head + "." if start_head else ""
    return [head + str(i) for i in range(x, y + 1)]
then
In [67]: get_dotted_range(start, end)
Out[67]: ['2.1.0', '2.1.1', '2.1.2', '2.1.3']
One way:
def get_dotted_range(start, end):
    """Return every dotted version from ``start`` to ``end`` inclusive.

    Args:
        start: First version, e.g. "2.1.9".
        end: Last version, e.g. "2.1.12".

    Returns:
        A list of version strings; an empty list when the components before
        the last one differ or ``end`` is lower than ``start``.

    Raises:
        ValueError: If a last component is not an integer.
    """
    sparts = start.split('.')
    eparts = end.split('.')
    # The leading components must match; without this check the prefix of
    # ``end`` was silently ignored ("2.1.0".."2.2.3" returned 2.1.x values).
    if sparts[:-1] != eparts[:-1]:
        return []
    prefix = '.'.join(sparts[:-1])
    slast = int(sparts[-1])
    elast = int(eparts[-1])
    return [prefix + '.' + str(i) for i in range(slast, elast + 1)]


print(get_dotted_range('2.1.0', '2.1.3'))
print(get_dotted_range('2.1.9', '2.1.12'))
results in:
['2.1.0', '2.1.1', '2.1.2', '2.1.3']
['2.1.9', '2.1.10', '2.1.11', '2.1.12']
start = "2.1.0"
end = "2.1.3"
# split once to get last value
s_spl, e_spl = start.rsplit(".", 1), end.rsplit(".", 1)
# get first part of string to join up later
pre = s_spl[0]
# Bind ``out`` up front so later uses see an empty list, not a NameError,
# when the two prefixes differ.
out = []
# make sure first two parts are identical
if pre == e_spl[0]:
    # loop in range from last element of start
    # up to and including last element of end
    out = ["{}.{}".format(pre, i) for i in range(int(s_spl[1]), int(e_spl[1]) + 1)]
    print(out)
print(out)
['2.1.0', '2.1.1', '2.1.2', '2.1.3']
So in a function we would return a list or False:
def get_dotted_range(start,end):
    """List the versions from ``start`` to ``end``, or return False.

    Both arguments are split once at their last dot.  False is returned when
    the text before that dot differs; otherwise the result has one
    "prefix.n" string for every n from start's last number up to and
    including end's.
    """
    first = start.rsplit(".", 1)
    final = end.rsplit(".", 1)
    prefix = first[0]
    if prefix != final[0]:
        return False
    low = int(first[1])
    high = int(final[1])
    return ["{}.{}".format(prefix, number) for number in range(low, high + 1)]
You should also consider the cases where a user enters incorrect data that cannot be cast to an int, the format is incorrect or they enter an empty string so you get an error indexing etc...
def get_dotted_range(start, end):
    """List the versions from ``start`` to ``end``, reporting bad input.

    Returns a list of "prefix.n" strings when the text before the last dot
    is the same in both arguments, False when it differs, and an error
    message string when a last component is not an integer (ValueError) or
    an argument contains no dot (IndexError).
    """
    try:
        first = start.rsplit(".", 1)
        final = end.rsplit(".", 1)
        if first[0] != final[0]:
            return False
        pre = first[0]
        low, high = int(first[1]), int(final[1])
        return ["{}.{}".format(pre, number) for number in range(low, high + 1)]
    except ValueError as err:
        return "{}: Digit expected".format(err)
    except IndexError as err:
        return "{}: Input format should be d.d.d ".format(err)
There are other cases you may want to catch, like when a user enters the start and end backwards, which will end up returning an empty list.
I have a file where each line is ordered alphabetically. The file is 12Gb, which means I can't simply read it line by line. The data looks like this:
brown 0 1 0 1 2
fox 3 5 0 0 1
jumped 2 0 6 1 0
The words at the beginning of each line are unique. The word and the numbers on each line are separated by tabs. I want to be able to query the file for specific keywords. For example, if I query "fox", the program should return "fox 3 5 0 0 1".
It seems that a good candidate for this would be the bisect module: https://docs.python.org/3.0/library/bisect.html
I found a post which uses bisect to find out the line number of a keyword: How do I perform binary search on a text file to search a keyword in python?
This is what the code looks like:
import bisect
import os
class Query(object):
    """A search string that can be ordered against raw file records."""

    def __init__(self, query, index=5):
        # ``index`` is the offset inside a record at which the part that is
        # compared with ``query`` starts.
        self.index = index
        self.query = query

    def __lt__(self, comparable):
        """Return True when the query sorts before the record's tail."""
        record_tail = comparable[self.index:]
        return self.query < record_tail
class FileSearcher(object):
    """Sequence-style view of a file made of fixed-size records."""

    def __init__(self, file_pointer, record_size=35):
        # A stored record is ``record_size`` characters plus the platform
        # line separator.
        full_record = record_size + len(os.linesep)
        # Move to the end of the file to measure its total size.
        file_pointer.seek(0, os.SEEK_END)
        total = file_pointer.tell()
        self.file_pointer = file_pointer
        self.record_size = full_record
        self.num_bytes = total
        self.file_size = total // full_record

    def __len__(self):
        """Return the number of whole records in the file."""
        return self.file_size

    def __getitem__(self, item):
        """Return record number ``item`` (0-based), separator included."""
        offset = item * self.record_size
        self.file_pointer.seek(offset)
        return self.file_pointer.read(self.record_size)
# Demo driver (uses the Python 2 print statement): binary-search the records
# of 'myfile' for the query token.
with open('myfile') as file_to_search:
query = 'fox\t' #token to query
wrapped_query = Query(query)
searchable_file = FileSearcher(file_to_search)
# bisect reaches the file only through FileSearcher.__len__/__getitem__ and
# orders the query against records through Query.__lt__.  The result is a
# record index, not a byte offset (__getitem__ multiplies it by record_size).
linepos = bisect.bisect(searchable_file, wrapped_query)
print "Located # line: ", linepos
#print content of line?
However, I can't figure out how to actually print the content of the line. I should at least add a read statement somewhere, but I don't know where.
Is it possible to print the content of the line with the bisect module?
If you want go with Python solution, you can do the following:
Read file by small chunks of MAX_LINE bytes, each time moving forward by fixed offset
That offset determines block size
For each such read, determine the key (first word in a line)
These keys serve as delimiters of blocks
Construct the list of such keys. The list would be sorted as keys are ordered
You may persist such list somewhere via pickle/json.dumps/...
When querying, find via bisect the index of the block where your key is located
Read that block entirely and find the key with data
Here is the example file bigfile:
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7
The code:
import os
from bisect import bisect
# Upper bound on the length of one line, newline included.
MAX_LINE = 7
# Distance in bytes between two consecutive index entries.
BLOCK_SIZE = 10


def parse_chunks(filename):
    """Build a sparse index of keys, one entry per BLOCK_SIZE bytes.

    Entry 0 is the key (first word) of the first line.  Entry k (k >= 1) is
    the key of the first complete line that starts after the first newline
    found at or beyond byte ``k * BLOCK_SIZE``.

    Args:
        filename: Path of the file to index; its lines are expected to be
            sorted by key.

    Returns:
        The list of keys in file order (empty for an empty file).
    """
    size = os.path.getsize(filename)
    chunks = []
    with open(filename, 'rb') as file:
        # Work on the raw bytes: str() of a bytes object is its repr on
        # Python 3 ("b'...'"), in which the newlines can never be found.
        block = file.read(MAX_LINE * 2)
        if not block:
            return chunks
        first_line = block[:block.find(b'\n') + 1]
        chunks.append(first_line.split()[0].decode('utf-8'))
        pos = BLOCK_SIZE
        while pos < size:
            file.seek(pos)
            # Two maximum-length lines are enough to hold the tail of the
            # line cut by ``pos`` plus the complete line that follows it.
            block = file.read(MAX_LINE * 2)
            first_eol = block.find(b'\n')
            second_eol = block.find(b'\n', first_eol + 1)
            if first_eol == -1 or second_eol == -1:
                break
            line = block[first_eol + 1:second_eol]
            chunks.append(line.split()[0].decode('utf-8'))
            pos += BLOCK_SIZE
    return chunks
if __name__ == '__main__':
    BLOCK_SIZE = 10
    filename = 'bigfile'
    chunks = parse_chunks(filename)
    query = 'abc'
    # Last block whose index key is <= query.  Clamped at 0 so a query that
    # sorts before the first key cannot cause a negative seek.
    pos_before = max(bisect(chunks, query) - 1, 0)
    read_size = BLOCK_SIZE + MAX_LINE
    with open(filename, 'rb') as file:
        file.seek(pos_before * BLOCK_SIZE)
        # Read bytes and decode explicitly; str() of bytes is its repr on
        # Python 3.  'replace' tolerates a character cut by the block edge.
        block = file.read(read_size)
    candidates = block.decode('utf-8', 'replace').split('\n')
    # A full-size read that does not end on a newline stops mid-line; drop
    # that truncated tail so it cannot be mistaken for a key.
    if len(block) == read_size and not block.endswith(b'\n'):
        candidates = candidates[:-1]
    # Any block but the first may start mid-line; drop that leading piece.
    if pos_before > 0:
        candidates = candidates[1:]
    line = None
    for candidate in candidates:
        # Compare the key itself; a substring search could match the query
        # inside another line's data.
        if candidate.split()[:1] == [query]:
            line = candidate
            break
    print(line if line is not None else 'not found: ' + query)
In this toy example I use block size of 10 bytes, in your case of 12GB file I'd suggest you to start with 1M.
The following recursive function should be able to narrow the search interval. I'm not sure that you can modify it so that it returns a match or None for no match.
def bisearch(f, word, i, j, min_span=1E6):
    """Narrow the region of a sorted, tab-separated file that may hold ``word``.

    Args:
        f: File object opened in text mode and supporting seek(); every line
            starts with its key followed by a tab.
        word: Key to look for.
        i: Start offset of the region (0 or the start of a line).
        j: End offset of the region (exclusive).
        min_span: Regions smaller than this are returned without further
            narrowing.

    Returns:
        ``(k, k)`` when the line starting at offset ``k`` has key ``word``;
        otherwise ``(i, j)`` bounds of a region for the caller to scan.
    """
    if (j - i) < min_span:
        return i, j
    # Integer midpoint: seek() does not accept a float.
    k = int(i + j) // 2
    f.seek(k)
    # Skip the rest of the line the midpoint fell into.
    found_eol = False
    while k < j:
        c = f.read(1)
        k = k + 1
        if c == '\n':
            found_eol = True
            break
    if not found_eol or k >= j:
        # No line starts strictly inside the upper half, so the region
        # cannot be split safely; let the caller scan it.
        return i, j
    # Read the key of the line that starts at k.
    w = []
    while True:
        c = f.read(1)
        # An empty read means end of file; stop instead of looping forever.
        if c == '\t' or not c:
            break
        w.append(c)
    w = "".join(w)
    if w == word:
        return k, k
    if w < word:
        return bisearch(f, word, k, j, min_span)
    return bisearch(f, word, i, k, min_span)
and here an example of usage
word = ...
f = open(...)
i,j = bisearch(f, word, 0, len_f)
f.seek(i)
if i==j:
line = f.readline()
else:
#################### EDIT ################
# OLD
# buffer = f.read(1E6)
# NEW
buffer = f.read(j-i)
lenw = len(word)
for line in buffer.split('\n'):
if line[:lenw] == word: break
else:
# no matches, SOS
result = process(line)
Try seeking to the line in question and using readline.
print "Located # line: ", linepos
file_to_search.seek(linepos)
line = file_to_search.readline()
This is assuming linepos is the position of the line, counted in bytes from the beginning of the file. If it's the position counted in line numbers, you'll need to multiply by the number of bytes per line before seeking.
print "Located # line: ", linepos
file_to_search.seek(linepos * searchable_file.record_size)
line = file_to_search.readline()