Extract string between 2 delimiters - python

I'm trying to extract some words between two delimiters. It works for the files where the script find these delimiters, but for the others files, the code extract all of the file.
Example:
File 00.txt:
'bqukfkb saved qshfqs illjQNqdj iohqsijqsd qsoiqsdqs'
File 01.txt:
'jkhjkl dbdqs ihnzqid Bad value okkkk SPAN sfsdf didjsfsdf'
I want to open 2 or more files like these two and extract only words between:
'Bad Value' and 'SPAN'.
My code works for the file 01.txt, but not for the 00.txt ( i think it's because it doesn't find the delimiters so he prints everything. How can i fix it ?
def get_path(): #return the path of the selected file(s)
root = Tk()
i= datetime.datetime.now()
day = i.day
month=i.month
root.filename = filedialog.askopenfilenames(initialdir = "Z:\SGI\SYNCBBG",title = "Select your files",filetypes = (("Fichier 1","f6365tscf.SCD*"+str(month)+str(day)+".1"),("all files",".*")))
root.withdraw()
return (root.filename)
def extraction_error(file):
f=open(file,'r')
file=f.read()
f.close()
start = file.find('Bad value') +9
end = file.find('SPAN', start)
return(file[start:end])
paths=get_path()
cpt=len(paths)
for x in range(0,cpt):
print(extraction_error(paths[x]))
Output : saved qshfqs illjQNqdj iohqsijqsd qsoiqsdq
okkkk
So in this case i just want to extract 'okkkk' and not print ' saved....' for the other file.
Thanks in advance for your help

In your extraction_error function, you may want to test if the two key words can be found:
start = file.find('Bad value') # remove + 9 here, put it later
end = file.find('SPAN', start)
if start != -1 and end != -1: # test if key words can be found, -1 for not found:
return(file[start+9:end])
else:
return ""

You're printing out something, because you are adding 8 to the start variable. Find returns negative one if the string is not found. So what you end up doing is printing out the elements from [7:-1]. I would add an if statement before the print statement:
start = file.find('Bad value')
end = file.find('SPAN', start)
if start != -1 and end != -1:
print(file[start + 9: end])

string.find() return -1 if the argument is not found in the string, example:
print "abcd".find("e") # -1
You can just check the result before the return:
start = file.find('Bad value') + 9
end = file.find('SPAN', start)
if start == -1 or end == -1:
return '' # Or None
return(file[start:end])

Using re:
import re
def get_text(text):
pattern= r'.+(Bad value)(.+)(SPAN).+'
r=re.match(pattern,text)
if r!=None and len(r.groups()) == 3:
print(r.groups()[1])
lines = [
'jkhjkl dbdqs ihnzqid Bad value okkkk SPAN sfsdf didjsfsdf'
,'ghghujh']
for line in lines:
get_text(line)
Output:
okkkk

Related

probleme with old python scrip

I'm still republishing a python 2.x script in 3.x.
at some point, the script must replace the "print" function with "disp" (equivalent in TI basic language) except that it no longer works because of parentheses. anyone have an idea to fix it?
The code :
elif (line.find("print ")==idepth(line)):
line = replace(line,"print ","Disp ")
if (line[-1] == ","):
line = line[:-1].rstrip() # Trailing , not legal for ti basic
thanks in advance
Edit: full code :
import sys
import os
import re
#GUI:
import tkinter as tk
from tkinter import filedialog
import tkinter.simpledialog
import tkinter.messagebox
GUI_MODE = False
TAB_REPLACE = " "
def main():
args = sys.argv[1:]
global GUI_MODE
print (args)
if (len(args)==0):
GUI_MODE=True
root = tk.Tk()
root.withdraw()
inp = filedialog.askopenfilename(title="Select a python script to convert")
if (inp==''):
print ("cancelled")
return 0
else:
inp=args[0]
#now input file is known
file=open(inp)
prog = file.read().replace("\r","").split("\n") # Lines of code
#Get a program name:
if (prog[0][:1]=="#" and (prog[0].upper().find("NAME:")>-1 or prog[0].upper().find("PROGRAM:")>-1 )):
outname=prog[0][prog[0].find(":")+1:]
else:
outname=tkinter.simpledialog.askstring("File name","What do you want to name this program?")
fixed = format(prog)
#Write the converted program in this folder:
print ("\n--Converted to TI-Basic code:--")
print (fixed)
print ("")
print ("Making output files: "+outname+".tib, "+outname+".8xp ...")
#Write the converted program in this folder:
outfile=open(outname+".tib","w")
outfile.write(fixed)
outfile.close()
#Write the .8xp program
outfile=open(outname+".8xp","w")
outfile.write(fixed)
outfile.close()
#assuming the compiler tibasic.exe is in this folder:
if (sys.platform[:3]=="win"):
if (os.system('tibasic.exe '+outname+'.tib')): #Returns non-0, error:
errReport("Error trying to run tibasic.exe! Make sure it is in the current folder.")
else:
if (os.system('wine tibasic.exe '+outname+'.tib')): #Returns non-0, error:
errReport("Error trying to run tibasic.exe! Make sure it is in the current folder, and w.i.n.e is installed.\n"+
"(See http://www.winehq.org/ for installer)")
os.remove(outname+".tib")
a=input("Done! Press enter to exit:") #pause
return 0
def format(linesArray): #converts lines from Python to ti-basic.
for i in range(len(linesArray)):
linesArray[i]=linesArray[i].replace("\t",TAB_REPLACE) #Important! see idepth()
i=0;
linesArray.append("") # 0-indent ending so blockAddEnd won't mess up.
while (i<len(linesArray)):
#Convert control blocks (if, for, while) from indented (python) to END (TI)
line = linesArray[i]
if isBlockStart(line,"for "):
linesArray = blockAddEnd(linesArray, i, "End")
elif isBlockStart(line,"if "):
linesArray = blockAddEnd(linesArray, i, "End")
elif isBlockStart(line,"while "):
linesArray = blockAddEnd(linesArray, i, "End")
elif isBlockStart(line,"repeat "): #not in python, but works on TI.
linesArray = blockAddEnd(linesArray, i, "End")
i+=1
# Don't need indentations anymore, do the rest of the conversions:
for i in range(len(linesArray)):
linesArray[i]=convLine(linesArray[i],i+1)
#Remove blanks:
for i in range(linesArray.count("")):
linesArray.remove("")
return "\n".join(linesArray)
def convLine(line,num): #Line by line conversion.
line = line.rstrip().lstrip() #trim indentation.
lnum = "Line "+str(num)+": "
if line.count("#"):
comment = line[line.find("#"):]
if (comment[0:6] == "#no-ti"):
#Does not work on the ti.
return ""
elif (comment[0:4] == "#ti:"):
# Only for ti:
return comment[4:]
else:
line = line[:line.find("#")] # take comment off code
#No imports in ti-basic!
if line.startswith("import ") or (line.count(" import ") and line.startswith("from ")):
return ""
#Errors and warnings:
if (toolong(line)):
print (lnum+"Warning: Text string too long to fit on a TI83/84 screen. The calculator screen is 16 characters wide, 8 characters high.")
if (line.find("\n")>-1):
print (lnum+"Warning: newline \\n is not allowed in TI-Basic.")
if (line.find("'''")>-1):
print (lnum+"Warning: ''' quotes are not allowed, you must use \" quotes on a single line for TI-Basic.")
if (replace(line,"pow(","")!=line):
errReport(lnum+"TI calculators don't have the pow() command, you must use a**b instead of pow(a,b).")
if (replace(line,"import ","")!=line):
print (lnum+"import ignored. No import statements in TI-Basic!")
return "" # ignore import statements!
if (replace(line,"-=","")!=line):
errReport(lnum+"The -= operator is not allowed.\nTry +=- or a=a+-number instead.")
if (replace(line,"def ","")!=line):
errReport(lnum+"Functions are not supported in TI-Basic! However, you can run another program with \"prgmPRGNAME\".")
if (replace(line,"//","")!=line):
print (lnum+"// division converted to / division: For int division, try int(a/b).")
line=replace(line,"//","/")
if (replace(line,"-","")!=line):
print (lnum+"Warning: The - is changed to negative sign on the calculator. If you wanted to subtract, use a+-b instead of a-b.")
if (replace(line,"open(","")!=line):
errReport(lnum+"Error: TI calculators can't use \"open(filename)\" in programs. To store text, try using variables STR0, STR1, ... STR9.")
if (replace(line,"%","")!=line):
errReport(lnum+"Error: TI83/84 calculators don't have Mod.\n Instead of a % b, try (a/b-int(a/b))*b instead.")
# Replace excess spaces, they cause errors in the calculator:
line=replace(line,", ",",")
line=replace(line," + ","+")
line=replace(line," - ","-")
line=replace(line," +- ","+-")
line=replace(line," * ","*")
line=replace(line," / ","/")
line=replace(line," == ","==")
line=replace(line," > ",">")
line=replace(line," < ","<")
line=replace(line," != ","!=")
#TODO: Arrays converted to lists?
line=replace(line,"theta","[theta]") # variable
line=replace(line,"**","^")
line=mathReplace(line)
#round, max, min already works.
line=replace(line,"float(","(")
line=replace(line,"len(","dim(")
line=replace(line,"math.pi","[pi]")
line=replace(line,"math.e","[e]")
line=replace(line,"eval(","expr(")
line=replace(line,"-","[neg]") # use +- instead of - operator.
line=replace(line,"==","=")
line=replace(line," and ","&")
line=replace(line," or ","|")
line=replace(line,"random.random()","rand")
line=replace(line,"random.randint","RandInt")
line=replace(line,"int(","iPart(")
if (replace(line,"input(","") != line):
line=inputConv(line,num)
if isBlockStart(line,"for "):
line=forConv(line,num)
elif (isBlockStart(line,"if ")):
line = replace(line,"if ","If ")
line = replace(line,":",":Then")
elif (isBlockStart(line,"while ")):
line = replace(line,"while ","While ")
line = replace(line,":","")
elif (isBlockStart(line,"repeat")):
line = replace(line,"repeat","Repeat")
line = replace(line,":","")
elif (isBlockStart(line,"else")):
line = replace(line,"else:","Else")
elif isBlockStart(line,"elif"):
errReport(lnum+"""Error: There is no else-if command on the TI83/84. However, you can use this instead:
if <condition>:
...
else:
if <condition>:
...
else:
...""")
elif (line.find("print ")==idepth(line)):
line = re.sub(r"print *\((.+)\)", r"disp \1", line)
if (line[-1] == ","):
line = line[:-1].rstrip() # Trailing , not legal for ti basic
elif (replace(line,"=","")!=line): #assignment is -> on the calculator.
eqspace = line.find("=")
line = line[eqspace+1:].rstrip().lstrip() + "->" + line[:eqspace].rstrip().lstrip() # sto arrow.
line = fixEQ(line)
return replace(line,"+[neg]","-") #lastly, switch back the negative.
def fixEQ(line):
# fix +=, *=, /=.
# A+=1 changes to 1->A+, so fix it now.
if (line[-1]=="+" or line[-1]=="*" or line[-1]=="/"):
line = line[:-1].rstrip()+line[-1] # remove any spaces in "a +" etc
pre = line[line.find("->")+2:]
#pre = pre[:-1].rstrip()+pre[-1]
line= pre + "("+line[:line.find("->")]+")"+ line[line.find("->"):-1]
return line
def inputConv(line,num):
lnum = "Line "+str(num)+": "
if (replace(line,"raw_input(","")!=line and line==replace(line,"=","")):
#raw_input not assigned to variable is like Pause.
return "Pause "
else:
var = line[:line.find("=")].rstrip().lstrip()
if (len(var)>1 and var!="theta"): # might be invalid.
print (lnum+"Warning: Program tries to store to variable \"%s\"." % var)
prompt = line[line.find("input(")+6:]
prompt = prompt[:prompt.find(")")]
# Now return the TI basic input with var spaces removed:
return "Input "+prompt+","+var
def forConv(line,num):
lnum = "Line "+str(num)+": "
# split "for i in range(...):"
var = line[line.find("for ")+4:line.find(" in range")]
#print var
part = line[line.find("in range(")+9:] # only "...) : "
part = part.rstrip(": ")[:-1] # remove extra " " or ":", remove last ).
#print "'"+line+"'"
out = part.split(",")
if len(out)==1:
return "For(%s,0,(%s)-1)" % (var, out[0])
elif len(out)==2:
return "For(%s,(%s),(%s)-1)" % (var, out[0], out[1])
elif len(out)==3:
return "For(%s,(%s),(%s)-1,(%s)" % (var, out[0], out[1], out[2])
else:
errReport(lnum+"Too many commas in for loop!")
return "couldn't convert: "+line
def blockAddEnd(lines, startLine, endText):
# Takes an array, line #, and end text.
# Adds end for that indentation block.
startInd = idepth(lines[startLine])
if idepth(lines[startLine+1]) <= startInd:
errReport("Expected indent after line "+str(startLine+1)+".")
i = startLine+1
#continue searching for the end while it's indented or it's an else line:
while idepth(lines[i]) > startInd or (isBlockStart(lines[i],"else")):
i+=1
# now insert.
lines.insert(i,endText)
return lines
def idepth(text):
# get indentation depth of line.
depth=0
line = text.replace("\t",TAB_REPLACE) #tab is 4 spaces.
while (line[:1]==" "):
line=line[1:]
depth+=1
return depth
def replace(text, changethis, tothis):
# replaces text, but not in quotes.
arr = text.split("\"")
for i in range(0,len(arr),2):
arr[i]=arr[i].replace(changethis, tothis)
return "\"".join(arr)
def toolong(text):
# checks for too long string:
arr = text.split("\"")
for i in range(1,len(arr),2):
#print arr[i]
if (len(arr[i]) > 16):
return True
return False
def parMatch(text,num): # given "(stuff()...()))", returns the parentheses block.
lnum = "Line "+str(num)+": "
for i in range(len(text)):
part = text[:i-1]
if (part.count("(")==part.count(")")):
return part[1:-1] #without outside parentheses.
errReport(lnum+"Invalid parentheses")
def isBlockStart(line, type):
# Check if the line is start of a <type> block.
# checks if it starts with <type>, and ends with ":".
# example: isBlockStart("for i in range(8) : ","for") is true.
return (line.find(type) == idepth(line) and line.rstrip(" ")[-1]==":")
def errReport(text):
print (text)
if (GUI_MODE):
root = tk.Tk()
root.withdraw()
tkinter.messagebox.showerror("Error",text)
sys.exit(1)
def mathReplace(line):
""" Replaces mathematical functions with ti basic functions. """
#Same function in both Python and TI-basic:
same = ["sin", "cos", "tan", "asin", "acos", "atan", "sinh", "cosh", "tanh", "asinh", "acosh", "atanh"]
line=replace(line,"math.sqrt(","[root]^2(")
line=replace(line,"math.fabs(","abs(")
for func in same:
line = replace(line,"math.%s(" % func,func)
line=replace(line,"math.log(","ln(")
line=replace(line,"math.exp(","e^(")
line=replace(line,"math.floor(","int(")
line=replace(line,"math.log10(","log(")
#same, but without "math." They might use
#from math import sqrt etc...
line=replace(line,"sqrt(","[root]^2(")
line=replace(line,"fabs(","abs(")
for func in same:
line = replace(line, "%s(" % func,func)
#(Redundant lines deleted)
line=replace(line,"log(","ln(")
line=replace(line,"exp(","e^(")
line=replace(line,"floor(","int(")
line=replace(line,"log10(","log(")
return line
if __name__ == '__main__': main()
it's the original code with changement propose below
You can use python's regular expression library to perform more advanced string matching and replacement than replace(). Specifically, re.sub(), which functions the same as replace() but takes regular expressions instead of simple strings.
Be sure to first import it with import re. You can then do the following:
elif (line.find("print ")==idepth(line)):
line = re.sub(r"print *\(\"(.+)\"\)", r"disp \1", line)
if (line[-1] == ","):
line = line[:-1].rstrip() # Trailing , not legal for ti basic
This will look for a string of the format "print("&1")" or "print ("&1")" and replace it with "disp &1", where &1 is the contents between the quotes.
Edit: You had initially specified that you wanted the output "without the quotes", but it seems you've edited that comment. If you want to include the quotes in the output, use this line instead:
line = re.sub(r"print *\((.+)\)", r"disp \1", line)

Python 3.6 script surprisingly slow on Windows 10 but not on Ubuntu 17.10

I recently had to write a challenge for a company that was to merge 3 CSV files into one based on the first attribute of each (the attributes were repeating in all files).
I wrote the code and sent it to them, but they said it took 2 minutes to run. That was funny because it ran for 10 seconds on my machine. My machine had the same processor, 16GB of RAM, and had an SSD as well. Very similar environments.
I tried optimising it and resubmitted it. This time they said they ran it on an Ubuntu machine and got 11 seconds, while the code ran for 100 seconds on the Windows 10 still.
Another peculiar thing was that when I tried profiling it with the Profile module, it went on forever, had to terminate after 450 seconds. I moved to cProfiler and it recorded it for 7 seconds.
EDIT: The exact formulation of the problem is
Write a console program to merge the files provided in a timely and
efficient manner. File paths should be supplied as arguments so that
the program can be evaluated on different data sets. The merged file
should be saved as CSV; use the id column as the unique key for
merging; the program should do any necessary data cleaning and error
checking.
Feel free to use any language you’re comfortable with – only
restriction is no external libraries as this defeats the purpose of
the test. If the language provides CSV parsing libraries (like
Python), please avoid using them as well as this is a part of the
test.
Without further ado here's the code:
#!/usr/bin/python3
import sys
from multiprocessing import Pool
HEADERS = ['id']
def csv_tuple_quotes_valid(a_tuple):
"""
checks if a quotes in each attribute of a entry (i.e. a tuple) agree with the csv format
returns True or False
"""
for attribute in a_tuple:
in_quotes = False
attr_len = len(attribute)
skip_next = False
for i in range(0, attr_len):
if not skip_next and attribute[i] == '\"':
if i < attr_len - 1 and attribute[i + 1] == '\"':
skip_next = True
continue
elif i == 0 or i == attr_len - 1:
in_quotes = not in_quotes
else:
return False
else:
skip_next = False
if in_quotes:
return False
return True
def check_and_parse_potential_tuple(to_parse):
"""
receives a string and returns an array of the attributes of the csv line
if the string was not a valid csv line, then returns False
"""
a_tuple = []
attribute_start_index = 0
to_parse_len = len(to_parse)
in_quotes = False
i = 0
#iterate through the string (line from the csv)
while i < to_parse_len:
current_char = to_parse[i]
#this works the following way: if we meet a quote ("), it must be in one
#of five cases: "" | ", | ," | "\0 | (start_of_string)"
#in case we are inside a quoted attribute (i.e. "123"), then commas are ignored
#the following code also extracts the tuples' attributes
if current_char == '\"':
if i == 0 or (to_parse[i - 1] == ',' and not in_quotes): # (start_of_string)" and ," case
#not including the quote in the next attr
attribute_start_index = i + 1
#starting a quoted attr
in_quotes = True
elif i + 1 < to_parse_len:
if to_parse[i + 1] == '\"': # "" case
i += 1 #skip the next " because it is part of a ""
elif to_parse[i + 1] == ',' and in_quotes: # ", case
a_tuple.append(to_parse[attribute_start_index:i].strip())
#not including the quote and comma in the next attr
attribute_start_index = i + 2
in_quotes = False #the quoted attr has ended
#skip the next comma - we know what it is for
i += 1
else:
#since we cannot have a random " in the middle of an attr
return False
elif i == to_parse_len - 1: # "\0 case
a_tuple.append(to_parse[attribute_start_index:i].strip())
#reached end of line, so no more attr's to extract
attribute_start_index = to_parse_len
in_quotes = False
else:
return False
elif current_char == ',':
if not in_quotes:
a_tuple.append(to_parse[attribute_start_index:i].strip())
attribute_start_index = i + 1
i += 1
#in case the last attr was left empty or unquoted
if attribute_start_index < to_parse_len or (not in_quotes and to_parse[-1] == ','):
a_tuple.append(to_parse[attribute_start_index:])
#line ended while parsing; i.e. a quote was openned but not closed
if in_quotes:
return False
return a_tuple
def parse_tuple(to_parse, no_of_headers):
"""
parses a string and returns an array with no_of_headers number of headers
raises an error if the string was not a valid CSV line
"""
#get rid of the newline at the end of every line
to_parse = to_parse.strip()
# return to_parse.split(',') #if we assume the data is in a valid format
#the following checking of the format of the data increases the execution
#time by a factor of 2; if the data is know to be valid, uncomment 3 lines above here
#if there are more commas than fields, then we must take into consideration
#how the quotes parse and then extract the attributes
if to_parse.count(',') + 1 > no_of_headers:
result = check_and_parse_potential_tuple(to_parse)
if result:
a_tuple = result
else:
raise TypeError('Error while parsing CSV line %s. The quotes do not parse' % to_parse)
else:
a_tuple = to_parse.split(',')
if not csv_tuple_quotes_valid(a_tuple):
raise TypeError('Error while parsing CSV line %s. The quotes do not parse' % to_parse)
#if the format is correct but more data fields were provided
#the following works faster than an if statement that checks the length of a_tuple
try:
a_tuple[no_of_headers - 1]
except IndexError:
raise TypeError('Error while parsing CSV line %s. Unknown reason' % to_parse)
#this replaces the use my own hashtables to store the duplicated values for the attributes
for i in range(1, no_of_headers):
a_tuple[i] = sys.intern(a_tuple[i])
return a_tuple
def read_file(path, file_number):
"""
reads the csv file and returns (dict, int)
the dict is the mapping of id's to attributes
the integer is the number of attributes (headers) for the csv file
"""
global HEADERS
try:
file = open(path, 'r');
except FileNotFoundError as e:
print("error in %s:\n%s\nexiting...")
exit(1)
main_table = {}
headers = file.readline().strip().split(',')
no_of_headers = len(headers)
HEADERS.extend(headers[1:]) #keep the headers from the file
lines = file.readlines()
file.close()
args = []
for line in lines:
args.append((line, no_of_headers))
#pool is a pool of worker processes parsing the lines in parallel
with Pool() as workers:
try:
all_tuples = workers.starmap(parse_tuple, args, 1000)
except TypeError as e:
print('Error in file %s:\n%s\nexiting thread...' % (path, e.args))
exit(1)
for a_tuple in all_tuples:
#add quotes to key if needed
key = a_tuple[0] if a_tuple[0][0] == '\"' else ('\"%s\"' % a_tuple[0])
main_table[key] = a_tuple[1:]
return (main_table, no_of_headers)
def merge_files():
"""
produces a file called merged.csv
"""
global HEADERS
no_of_files = len(sys.argv) - 1
processed_files = [None] * no_of_files
for i in range(0, no_of_files):
processed_files[i] = read_file(sys.argv[i + 1], i)
out_file = open('merged.csv', 'w+')
merged_str = ','.join(HEADERS)
all_keys = {}
#this is to ensure that we include all keys in the final file.
#even those that are missing from some files and present in others
for processed_file in processed_files:
all_keys.update(processed_file[0])
for key in all_keys:
merged_str += '\n%s' % key
for i in range(0, no_of_files):
(main_table, no_of_headers) = processed_files[i]
try:
for attr in main_table[key]:
merged_str += ',%s' % attr
except KeyError:
print('NOTE: no values found for id %s in file \"%s\"' % (key, sys.argv[i + 1]))
merged_str += ',' * (no_of_headers - 1)
out_file.write(merged_str)
out_file.close()
if __name__ == '__main__':
# merge_files()
import cProfile
cProfile.run('merge_files()')
# import time
# start = time.time()
# print(time.time() - start);
Here is the profiler report I got on my Windows.
EDIT: The rest of the csv data provided is here. Pastebin was taking too long to process the files, so...
It might not be the best code and I know that, but my question is what slows down Windows so much that doesn't slow down an Ubuntu? The merge_files() function takes the longest, with 94 seconds just for itself, not including the calls to other functions. And there doesn't seem to be anything too obvious to me for why it is so slow.
Thanks
EDIT: Note: We both used the same dataset to run the code with.
It turns out that Windows and Linux handle very long strings differently. When I moved the out_file.write(merged_str) inside the outer for loop (for key in all_keys:) and stopped appending to merged_str, it ran for 11 seconds as expected. I don't have enough knowledge on either of the OS's memory management systems to be able to give a prediction on why it is so different.
But I would say that the way that the second one (the Windows one) is the more fail-safe method because it is unreasonable to keep a 30 MB string in memory. It just turns out that Linux sees that and doesn't always try to keep the string in cache, or to rebuild it every time.
Funny enough, initially I did run it a few times on my Linux machine with these same writing strategies, and the one with the large string seemed to go faster, so I stuck with it. I guess you never know.
Here's the modified code
for key in all_keys:
merged_str = '%s' % key
for i in range(0, no_of_files):
(main_table, no_of_headers) = processed_files[i]
try:
for attr in main_table[key]:
merged_str += ',%s' % attr
except KeyError:
print('NOTE: no values found for id %s in file \"%s\"' % (key, sys.argv[i + 1]))
merged_str += ',' * (no_of_headers - 1)
out_file.write(merged_str + '\n')
out_file.close()
When I run your solution on Ubuntu 16.04 with the three given files, it seems to take ~8 seconds to complete. The only modification I made was to uncomment the timing code at the bottom and use it.
$ python3 dimitar_merge.py file1.csv file2.csv file3.csv
NOTE: no values found for id "aaa5d09b-684b-47d6-8829-3dbefd608b5e" in file "file2.csv"
NOTE: no values found for id "38f79a49-4357-4d5a-90a5-18052ef03882" in file "file2.csv"
NOTE: no values found for id "766590d9-4f5b-4745-885b-83894553394b" in file "file2.csv"
8.039648056030273
$ python3 dimitar_merge.py file1.csv file2.csv file3.csv
NOTE: no values found for id "38f79a49-4357-4d5a-90a5-18052ef03882" in file "file2.csv"
NOTE: no values found for id "766590d9-4f5b-4745-885b-83894553394b" in file "file2.csv"
NOTE: no values found for id "aaa5d09b-684b-47d6-8829-3dbefd608b5e" in file "file2.csv"
7.78482985496521
I rewrote my first attempt without using csv from the standard library and am now getting times of ~4.3 seconds.
$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.332579612731934
$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.305467367172241
$ python3 lettuce_merge.py file1.csv file2.csv file3.csv
4.27345871925354
This is my solution code (lettuce_merge.py):
from collections import defaultdict
def split_row(csv_row):
return [col.strip('"') for col in csv_row.rstrip().split(',')]
def merge_csv_files(files):
file_headers = []
merged_headers = []
for i, file in enumerate(files):
current_header = split_row(next(file))
unique_key, *current_header = current_header
if i == 0:
merged_headers.append(unique_key)
merged_headers.extend(current_header)
file_headers.append(current_header)
result = defaultdict(lambda: [''] * (len(merged_headers) - 1))
for file_header, file in zip(file_headers, files):
for line in file:
key, *values = split_row(line)
for col_name, col_value in zip(file_header, values):
result[key][merged_headers.index(col_name) - 1] = col_value
file.close()
quotes = '"{}"'.format
with open('lettuce_merged.csv', 'w') as f:
f.write(','.join(quotes(a) for a in merged_headers) + '\n')
for key, values in result.items():
f.write(','.join(quotes(b) for b in [key] + values) + '\n')
if __name__ == '__main__':
from argparse import ArgumentParser, FileType
from time import time
parser = ArgumentParser()
parser.add_argument('files', nargs='*', type=FileType('r'))
args = parser.parse_args()
start_time = time()
merge_csv_files(args.files)
print(time() - start_time)
I'm sure this code could be optimized even further but sometimes just seeing another way to solve a problem can help spark new ideas.

parsing a string in python for #hashtag

I am wondering, how could I make an algorithm that parses a string for the hashtag symbol ' # ' and returns the full string, but where ever a word starts with a '#' symbol, it becomes a link. I am using python with Google app engine: webapp2 and Jinja2 and I am building a blog.
Thanks
A more efficient and complete way to find the "hashwords":
import functools
def hash_position(string):
return string.find('#')
def delimiter_position(string, delimiters):
positions = filter(lambda x: x >= 0, map(lambda delimiter: string.find(delimiter), delimiters))
try:
return functools.reduce(min, positions)
except TypeError:
return -1
def get_hashed_words(string, delimiters):
maximum_length = len(string)
current_hash_position = hash_position(string)
string = string[current_hash_position:]
results = []
counter = 0
while current_hash_position != -1:
current_delimiter_position = delimiter_position(string, delimiters)
if current_delimiter_position == -1:
results.append(string)
else:
results.append(string[0:current_delimiter_position])
# Update offsets and the haystack
string = string[current_delimiter_position:]
current_hash_position = hash_position(string)
string = string[current_hash_position:]
return results
if __name__ == "__main__":
string = "Please #clarify: What do you #mean with returning somthing as a #link. #herp"
delimiters = [' ', '.', ',', ':']
print(get_hashed_words(string, delimiters))
Imperative code with updates of the haystack looks a little bit ugly but hey, that's what we get for (ab-)using mutable variables.
And I still have no idea what do you mean with "returning something as a link".
Hope that helps.
not sure where do you get the data for the link, but maybe something like:
[('%s' % word) for word in input.split() if word[0]=='#']
Are you talking about twitter? Maybe this?
def get_hashtag_link(hashtag):
if hashtag.startswith("#"):
return '%s' % (hashtag[1:], hashtag)
>>> get_hashtag_link("#stackoverflow")
'#stackoverflow'
It will return None if hashtag is not a hashtag.

sublime plugin: find and select text

I got plugin for sublime text 3 that let me move cursor to line by its number:
import sublime, sublime_plugin
class prompt_goto_lineCommand(sublime_plugin.WindowCommand):
def run(self):
self.window.show_input_panel("Goto Line:", "", self.on_done, None, None)
pass
def on_done(self, text):
try:
line = int(text)
if self.window.active_view():
self.window.active_view().run_command("goto_line", {"line": line} )
except ValueError:
pass
class go_to_lineCommand(sublime_plugin.TextCommand):
def run(self, edit, line):
# Convert from 1 based to a 0 based line number
line = int(line) - 1
# Negative line numbers count from the end of the buffer
if line < 0:
lines, _ = self.view.rowcol(self.view.size())
line = lines + line + 1
pt = self.view.text_point(line, 0)
self.view.sel().clear()
self.view.sel().add(sublime.Region(pt))
self.view.show(pt)
I want to improve it to let me move cursor to first line containing the specified string. It is like a search on file:
For example if pass to it string "class go_to_lineCommand" plugin must move cursor to line 17 :
and possibly select string class go_to_lineCommand.
The problem is reduced to finding regionWithGivenString, and then I can select it:
self.view.sel().add(regionWithGivenString)
But don't know method to get regionWithGivenString.
I tried to
find on google: sublime plugin find and select text
check api
But still no result.
I am not sure about the typical way. However, you can achieve this in following way:
Get the content of current doc.
Search target string to find out its start and end position. Now you have the start and end point.
Add the Region(start, end) to selections.
Example:
def run(self, edit, target):
if not target or target == "":
return
content = self.view.substr(sublime.Region(0, self.view.size()))
begin = content.find(target)
if begin == -1:
return
end = begin + len(target)
target_region = sublime.Region(begin, end)
self.view.sel().clear()
self.view.sel().add(target_region)
there you have it in the API, use the view.find(regex,pos) method.
s = self.view.find("go_to_lineCommand", 0)
self.view.sel().add(s)
http://www.sublimetext.com/docs/3/api_reference.html
A possible improvement to the longhua's answer - adding moving cursor to the target line.
class FindcustomCommand(sublime_plugin.TextCommand):
def _select(self):
self.view.sel().clear()
self.view.sel().add(self._target_region)
def run(self, edit):
TARGET = 'http://nabiraem'
# if not target or target == "":
# return
content = self.view.substr(sublime.Region(0, self.view.size()))
begin = content.find(TARGET)
if begin == -1:
return
end = begin + len(TARGET)
self._target_region = sublime.Region(begin, end)
self._select()
self.view.show(self._target_region) # scroll to selection

Problems porting from Python to Ruby

I have a neat little script in python that I would like to port to Ruby and I think it's highlighting my noobishness at Ruby. I'm getting the error that there is an unexpected END statement, but I don't see how this can be so. Perhaps there is a keyword that requires an END or something that doesn't want an END that I forgot about. Here is all of the code leading up to the offending line Offending line is commented.
begin
require base64
require base32
rescue LoadError
puts "etext requires base32. use 'gem install --remote base32' and try again"
end
# Get a string from a text file from disk
filename = ARGV.first
textFile = File.open(filename)
text = textFile.read()
mailType = "text only" # set the default mailType
#cut the email up by sections
textList1 = text.split(/\n\n/)
header = textList1[0]
if header.match (/MIME-Version/)
mailType = "MIME"
end
#If mail has no attachments, parse as text-only. This is the class that does this
class TextOnlyMailParser
def initialize(textList)
a = 1
body = ""
header = textList[0]
#parsedEmail = Email.new(header)
while a < textList.count
body += ('\n' + textList[a] + '\n')
a += 1
end
#parsedEmail.body = body
end
end
def separate(text,boundary = nil)
# returns list of strings and lists containing all of the parts of the email
if !boundary #look in the email for "boundary= X"
text.scan(/(?<=boundary=).*/) do |bound|
textList = recursiveSplit(text,bound)
end
return textList
end
if boundary
textList = recursiveSplit(text,boundary)
end
end
def recursiveSplit(chunk,boundary)
if chunk.is_a? String
searchString = "--" + boundary
ar = cunk.split(searchString)
return ar
elsif chunk.is_a? Array
chunk do |bit|
recursiveSplit(bit,boundary);
end
end
end
class MIMEParser
def initialize(textList)
#textList = textList
#nestedItems = []
newItem = NestItem.new(self)
newItem.value = #textList[0]
newItem.contentType = "Header"
#nestedItems.push(newItem)
#setup parsed email
#parsedEmail = Email.new(newItem.value)
self._constructNest
end
def checkForContentSpecial(item)
match = item.value.match (/Content-Disposition: attachment/)
if match
filename = item.value.match (/(?<=filename=").+(?=")/)
encoding = item.value.match (/(?<=Content-Transfer-Encoding: ).+/)
data = item.value.match (/(?<=\n\n).*(?=(\n--)|(--))/m)
dataGroup = data.split(/\n/)
dataString = ''
i = 0
while i < dataGroup.count
dataString += dataGroup[i]
i ++
end #<-----THIS IS THE OFFENDING LINE
#parsedEmail.attachments.push(Attachment.new(filename,encoding,dataString))
end
Your issue is the i ++ line, Ruby does not have a post or pre increment/decrement operators and the line is failing to parse. I can't personally account as to why i++ evaluates in IRB but i ++ does not perform any action.
Instead replace your ++ operators with += 1 making that last while:
while i < dataGroup.count
dataString += dataGroup[i]
i += 1
end
But also think about the ruby way, if you're just adding that to a string why not do a dataString = dataGroup.join instead of looping over with a while construct?

Categories

Resources