The client's name comes after the word "for" and before the opening parenthesis "(" that starts the proposal number. I need to extract the client name so that I can use it to look up the deal in a later step. What would be the easiest way to set this up: Zapier's Extract Pattern, or Zapier Code in Python?
I have tried this and it did not work. It seemed promising though.
input_data
client = Reminder: Leruths has sent you a proposal for Business Name (#642931)
# Question's first attempt, kept verbatim (it does not work; the indentation
# was lost in this copy).
# `regex` is a separate package, not the standard-library `re` module.
import regex
# NOTE(review): the pattern literal below is split across two physical lines,
# and the '(' passed to format() is inserted into the pattern unescaped.
rgx = regex.compile(r'(?si)(?|{0}(.*?){1}|{1}(.*?)
{0})'.format('for', '('))
# This is the literal text 'client', not the message stored under `client`.
s1 = 'client'
for s in [s1]:
# `findall` is referenced but never called with `s`, so `m` is a bound method
# rather than a list of matches.
m = rgx.findall
for x in m:
# Python 2 print statement.
print x.strip()
I have also tried this and it did not work.
# Question's second attempt, kept verbatim (the indentation was lost in this copy).
# str.find returns the index where the substring begins, or -1 when absent.
start = mystring.find( 'for' )
end = mystring.find( '(' )
if start != -1 and end != -1:
# `start+1` skips only the 'f' of 'for', so the slice still begins with 'or '
# and also keeps the space that precedes '('.
result = mystring[start+1:end]
I am looking for Business Name to be returned in my example.
Fastest way:
# Locate the client name between the word "for" and the "(" that opens the
# proposal number.  Searching for " for " (with its spaces) avoids matching
# the letters "for" inside another word such as "Comfort".
marker = ' for '
start = client.find(marker)
# Look for "(" only after the marker; -1 means a delimiter is missing.
end = client.find('(', start + len(marker)) if start != -1 else -1
# Fall back to an empty string when either delimiter is missing instead of
# slicing with -1 and returning unrelated text.  strip() removes whatever
# spacing surrounds the name, so no fixed +4 / -1 offsets are needed.
result = client[start + len(marker):end].strip() if end != -1 else ''
print(result)
With regex:
import re

# Capture everything between " for " and the last " (" in the message
# (the group is greedy, so a name that contains parentheses is kept whole).
result = re.search(r' for (.*) [(]', client)
# re.search returns None when the message does not follow the expected
# layout; print an empty string rather than failing on None.group().
print(result.group(1) if result else '')
There is probably a cleaner way to do this, but here is another solution without regex
client = "Reminder: Leruths has sent you a proposal for Business Name (#642931)"
# Walk the message word by word: start collecting after the first "for" and
# stop at the word that opens the proposal number.
words = []
append = False
for word in client.split(" "):
    if append:
        if word.startswith("("):
            # The proposal number ends the name; nothing after it belongs to it.
            break
        # A later "for" is part of the name (e.g. "Food for Thought") and is kept.
        words.append(word)
    elif word == "for":
        append = True
# join builds the result in one pass instead of repeated string concatenation.
name = " ".join(words).strip()
print(name)
Another method:
client = "Reminder: Leruths has sent you a proposal for Business Name (#642931)"
cs = client.split(" ")
name = ""
forindex = cs.index("for")
for i in range(forindex+1, len(cs)):
if cs[i].startswith("("):
break
name += cs[i] + " "
name = name.strip()
print(name)
Running the code below gives:
Regex method took 2.3912417888641357 seconds
Search word by word method took 4.78193998336792 seconds
Search with list index method took 3.1756017208099365 seconds
String indexing method took 0.8496286869049072 seconds
Code to check the fastest to get the name over a million tries:
import re
import time
client = "Reminder: Leruths has sent you a proposal for Business Name (#642931)"
def withRegex(client):
    """Return the text between " for " and the last " (" in *client*.

    The capture group is greedy, so it extends to the final " (" of the
    message.  The match object is used without a None check.
    """
    found = re.search(r' for (.*) [(]', client)
    return found.group(1)
def searchWordbyWord(client):
    """Return the words between the first "for" and the proposal number.

    *client* is split on single spaces.  Collection starts after the first
    word that is exactly "for" and stops at the first following word that
    begins with "(".  A later "for" inside the name (for example
    "Food for Thought") is kept.  Returns an empty string when "for" never
    appears.
    """
    words = []
    collecting = False
    for word in client.split(" "):
        if collecting:
            if word.startswith("("):
                # The proposal number ends the name; stop scanning here.
                break
            words.append(word)
        elif word == "for":
            collecting = True
    # join builds the result in one pass instead of repeated concatenation.
    return " ".join(words).strip()
def searchWithListIndex(client):
    """Return the words that follow "for" up to the first word starting with "(".

    *client* is split on single spaces; list.index locates the word "for"
    and raises ValueError when it is absent.
    """
    tokens = client.split(" ")
    after_for = tokens.index("for") + 1
    kept = []
    for token in tokens[after_for:]:
        if token.startswith("("):
            break
        kept.append(token + " ")
    return "".join(kept).strip()
def stringIndexing(client):
    """Return the text between " for " and the next "(" in *client*.

    The marker is searched with its surrounding spaces so that the letters
    "for" inside another word (such as "Comfort") are not mistaken for it.
    Surrounding whitespace is stripped from the result.  Returns an empty
    string when either delimiter is missing.
    """
    marker = ' for '
    start = client.find(marker)
    if start == -1:
        return ''
    start += len(marker)
    # Only a "(" that comes after the marker can open the proposal number.
    end = client.find('(', start)
    if end == -1:
        return ''
    return client[start:end].strip()
def _time_method(method, tries=1000000):
    """Return the seconds taken to call *method* on ``client`` *tries* times."""
    # perf_counter is a high-resolution clock intended for measuring intervals.
    started = time.perf_counter()
    # range(tries) performs exactly *tries* calls; range(1, tries) is one short.
    for _ in range(tries):
        method(client)
    return time.perf_counter() - started


wr = _time_method(withRegex)
print("Regex method took " + str(wr) + " seconds")
sw = _time_method(searchWordbyWord)
print("Search word by word method took " + str(sw) + " seconds")
wl = _time_method(searchWithListIndex)
print("Search with list index method took " + str(wl) + " seconds")
si = _time_method(stringIndexing)
print("String indexing method took " + str(si) + " seconds")
I have two strings, i.e. 'This is a test as146634546576 string 12312523' and 'This is a test as576 string 12344612523'
Now I want to print the largest number in each, i.e. 146634546576 and 12344612523 respectively. I have written the following code, but it prints only 146634546576 and 576, where the second result should be 12344612523 instead of 576!
# Question's attempt, kept verbatim (the indentation was lost in this copy).
# It scans `text` for runs of digits and is meant to return the largest number.
def findLargestNumber(text):
# `front` is the index where the current digit run began, or -1 when the scan
# is not inside a run.
front = -1
li = []
li1 = []
for i in range(len(text)):
if front == -1:
if text[i].isdigit():
front = i
else:
continue
else:
if text[i].isdigit():
continue
else:
# A run is recorded only when a non-digit character follows it, so a number
# that ends the string (12344612523 in the second sample) is never appended.
# The slice also includes the non-digit at index i; int() tolerates the
# trailing space in the samples.
li.append(int(text[front:i+1]))
front = -1
# NOTE(review): wherever this return was indented, it leaves the function as
# soon as it executes, so for the sample inputs the code below is not reached.
return max(li)
#print max(li)
for w in text.split():
# int(w) raises ValueError for a non-numeric word such as 'This'.
li1.append(int(w))
return max(li1)
#print max(li1)
if max(li)>max(li1):
return max(li)
else:
return max(li1)
# Python 2 print statements.
print findLargestNumber('This is a test as146634546576 string 12312523')
print findLargestNumber('This is a test as576 string 12344612523')
Use max() with re.findall:
import re
a = 'This is a test as576 string 12344612523'
# Pull out every run of digits, convert each one to int, and show the largest.
numbers = [int(digits) for digits in re.findall(r'\d+', a)]
print(max(numbers))
# 12344612523
import re
# Two sample sentences, each holding one number glued to a word and one
# standing alone.
a = 'This is a test as146634546576 string 12312523'
b = 'This is a test as576 string 12344612523'
# Collect the digit runs of each sentence as strings.
num_in_a, num_in_b = (re.findall(r'[\d]+', sentence) for sentence in (a, b))
# Convert to int before comparing so the comparison is numeric.
for found in (num_in_a, num_in_b):
    print(max(int(digits) for digits in found))
Output:
146634546576
12344612523
import re
# Compiled once so that repeated calls reuse the same pattern object.
pa = re.compile(r'(\d+)')


def findLargestNumber(text):
    """Print and return the largest integer written in *text*.

    Every maximal run of digits counts as one number, whether it stands
    alone or is glued to a word (``as576``).  When *text* contains no digits
    the function prints and returns ``None`` instead of letting ``max()``
    raise ValueError on an empty sequence.
    """
    largest = max((int(digits) for digits in pa.findall(text)), default=None)
    # Still printed, as before, so existing call sites keep their output.
    print(largest)
    return largest


findLargestNumber('This is a test as576 string 12344612523')
findLargestNumber('This is a test as146634546576 string 12312523')
I have this string: a9*a9 + a10*a10
and I would like to have: a9*a8 + a10*a9
I think re.sub() from Python should be useful, but I am not familiar with group() that I've seen in some examples. Any help would be appreciated.
here's another solution method:
import re
s = 'a9*a9 + a10*a10 + a8*a8 + a255*a255 + b58*b58 + c58*c58'
# Drop the spaces first so every term has the same shape however the input
# was spaced, then cut the expression into its '+'-separated terms.
string = re.sub('[ ]', '', s)
x = string.split('+')
pattern = re.compile(r'([a-z])([\d]+)')
pieces = []
for element in x:
    factor_count = len(element.split('*'))
    # The last letter/number pair found in the term decides the output.
    letter, num = pattern.findall(element)[-1]
    # Factor k of the term gets the index lowered by k: a9*a9 -> a9*a8.
    factors = [letter + str(int(num) - offset) for offset in range(factor_count)]
    pieces.append('+' + '*'.join(factors))
ans = ''.join(pieces)
# Every piece starts with '+', so skip the very first character when printing.
print(ans[1:])
Output :
a9*a8+a10*a9+a8*a7+a255*a254+b58*b57+c58*c57
Assuming the structure of the input is a\d*a\d + a\d*a\d + ... you can use a callback in the re.sub function:
import re
# One or more digits per index, so multi-digit terms such as a10*a10 match.
# The word boundaries keep the pattern from matching inside a longer name.
SQUARE_TERM = re.compile(r'\ba(\d+)\*a(\d+)\b')


def decrement(match):
    """Rewrite a matched ``aN*aN`` term as ``aN*a(N-1)``.

    *match* must expose the two indices as groups 1 and 2.  A term whose two
    indices differ (``a3*a7``) is returned unchanged.
    """
    if match.group(1) != match.group(2):
        return match.group(0)
    return 'a{}*a{}'.format(match.group(1), int(match.group(2)) - 1)


print(SQUARE_TERM.sub(decrement, 'a3*a3 + a5*a5 + a3*a7'))
# a3*a2 + a5*a4 + a3*a7
print(SQUARE_TERM.sub(decrement, 'a9*a9 + a10*a10'))
# a9*a8 + a10*a9
I am trying to strip a line of code so that only the comment at the end is kept. Because # signs can appear inside "" marks, I am trying to cycle through the line catching pairs of " marks so that any # marks inside "" marks are ignored. When I use a code visualiser on my code below, after the second for loop it seems to go back to processing s as if it had just stripped the first " mark. I can't see what I'm doing wrong here, because the print statement I have included on line 19 shows that s has been stripped to after the second ", but when the code returns to the top, it starts cycling again from after the first ". Any idea what I am doing wrong here?
# Question's attempt, kept verbatim (the indentation was lost in this copy).
s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
quoteCount = 0
# The loop iterates over the string object that `s` names at this moment.
# Rebinding `s` inside the body creates new strings but does not change what
# this loop walks over, which is why iteration appears to resume from the
# old position.
for char in s:
# An even count means the scan is currently outside a quoted section.
if quoteCount%2 == 0:
if char == '#':
s = s[s.index('#'):]
break
if char == '"':
quoteCount = quoteCount + 1
s = s[s.index('"'):]
s = s.lstrip('"')
# This inner loop starts a fresh iteration over the shortened `s`.
for char in s:
if char == '"':
quoteCount = quoteCount + 1
s = s[s.index('"'):]
s = s.lstrip('"')
print(s)
break
print(s)
If I understand your question correctly you only want to keep the last comment (#lots of hash(#) symbols here).
To do this you don't need the nested for loop.
def find_comment(line):
    """Return the trailing ``#`` comment of *line*, or ``''`` if there is none.

    A ``#`` between a pair of double quotes belongs to a string literal and
    is skipped.  Only double quotes are recognised; escaped quotes and
    single-quoted strings are not handled.
    """
    in_quotes = False
    for position, char in enumerate(line):
        if char == '"':
            # Toggling per quote also copes with an empty literal ("").
            in_quotes = not in_quotes
        elif char == '#' and not in_quotes:
            return line[position:]
    return ''


s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
s = find_comment(s)
print(s)
Easier to remove the quoted strings with a regular expression:
import re
s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
# Delete every double-quoted section so that the first remaining '#' starts
# the comment.
pattern = r'"[^"]*"'
s = re.sub(pattern, '', s)
# partition yields empty strings when there is no '#', whereas s.index('#')
# raises ValueError on a line without a comment.
_, marker, rest = s.partition('#')
comment = marker + rest
# print() with a single argument behaves the same on Python 2 and Python 3.
print(comment)
Output:
#lots of hash(#) symbols here
Your code is overly complicated so I suggest you use an alternative method to finding the comment like the already mentioned regex one or the one I came up with.
s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
# Keep only what follows the final double quote of the line (the whole line
# when it has no quote, because rfind then returns -1).
tail = s[s.rfind('"') + 1:]
# The first '#' in that remainder starts the comment; without one the result
# is the empty string.
hash_at = tail.find('#')
s = '' if hash_at < 0 else tail[hash_at:]
print(s)
I want to extract certain values from a string in python.
snp_1_881627 AA=G;ALLELE=A;DAF_GLOBAL=0.473901;GENE_TRCOUNT_AFFECTED=1;GENE_TRCOUNT_TOTAL=1;SEVERE_GENE=ENSG00000188976;SEVERE_IMPACT=SYNONYMOUS_CODON;TR_AFFECTED=FULL;ANNOTATION_CLASS=REG_FEATURE,SYNONYMOUS_CODON,ACTIVE_CHROM,NC_TRANSCRIPT_VARIANT,NC_TRANSCRIPT_VARIANT;A_A_CHANGE=.,L,.,.,.;A_A_LENGTH=.,750,.,.,.;A_A_POS=.,615,.,.,.;CELL=GM12878,.,GM12878,.,.;CHROM_STATE=.,.,11,.,.;EXON_NUMBER=.,16/19,.,.,.;GENE_ID=.,ENSG00000188976,.,ENSG00000188976,ENSG00000188976;GENE_NAME=.,NOC2L,.,NOC2L,NOC2L;HGVS=.,c.1843N>T,.,n.3290N>T,n.699N>T;REG_ANNOTATION=H3K36me3,.,.,.,.;TR_BIOTYPE=.,PROTEIN_CODING,.,PROCESSED_TRANSCRIPT,PROCESSED_TRANSCRIPT;TR_ID=.,ENST00000327044,.,ENST00000477976,ENST00000483767;TR_LENGTH=.,2790,.,4201,1611;TR_POS=.,1893,.,3290,699;TR_STRAND=.,-1,.,-1,-1
Output:
GENE_ID GENE_NAME EXON_NUMBER SEVERE_IMPACT
snp_1_881627 ENSG00000188976 NOC2L 16/19 SYNONYMOUS_CODON
If the string has a value for each of those variables (GENE_ID, GENE_NAME, EXON_NUMBER), then output it; otherwise output "NA" (the variable doesn't exist or has no value). In some cases these variables don't exist in the string.
Which string method should I use to accomplish this? Should I split my string before extracting any values? I have 10k rows, and I need to extract the values for each snp_*.
# Rebinds `string` to the list of ';'-separated pieces; for the sample row
# above, the first piece still carries the row label ("snp_1_881627 AA=G").
string=string.split(';')
P.S. I am a newbie in python
There are two general strategies for this - split and regex.
To use split, first split off the row label (snp_1_881627):
# split() with no argument cuts on runs of whitespace; the two-name unpacking
# requires exactly two pieces and raises ValueError otherwise.
rowname, data = row.split()
Then, you can split data into the individual entries using the ; separator:
# Rebinds `data` from the single string to a list of 'KEY=VALUE' entries.
data = data.split(';')
Since you need to get the value of certain keys, we can turn it into a dictionary:
# Map each KEY to its VALUE; an entry without '=' maps to None.
dataDictionary = {}
for entry in data:
    # partition cuts at the first '=' only, so a value that itself contains
    # '=' is kept whole instead of being truncated at its second '='.
    key, separator, value = entry.partition('=')
    dataDictionary[key] = value if separator else None
Then you can simply check if the keys are in dataDictionary, and if so grab their values.
Using split is nice in that it will index everything in the data string, making it easy to grab whichever ones you need.
If the ones you need will not change, then regex might be a better option:
>>> import re
>>> re.search('(?<=GENE_ID=)[^;]*', 'onevalue;GENE_ID=SOMETHING;othervalue').group()
'SOMETHING'
Here I'm using a "lookbehind" to match one of the keywords, then grabbing the value from the match using group(). Putting your keywords into a list, you could find all the values like this:
import re
...
keywords = ['GENE_ID', 'GENE_NAME', 'EXON_NUMBER', 'SEVERE_IMPACT']
desiredValues = {}
for keyword in keywords:
    # Anchor the key to the start of a field (start of string, ';' or
    # whitespace) so that it cannot match the tail of a longer key, and
    # escape it so that any regex metacharacters are taken literally.
    field = re.compile(r'(?:^|[;\s]){}=([^;]*)'.format(re.escape(keyword)))
    match = field.search(string_to_search)
    # Group 1 is the value: everything up to the next ';'.
    desiredValues[keyword] = match.group(1) if match else DEFAULT_VALUE
I think this is going to be the solution you are looking for.
#input
user_in = 'snp_1_881627 AA=G;ALLELE=A;DAF_GLOBAL=0.473901;GENE_TRCOUNT_AFFECTED=1;GENE_TRCOUNT_TOTAL=1;SEVERE_GENE=ENSG00000188976;SEVERE_IMPACT=SYNONYMOUS_CODON;TR_AFFECTED=FULL;ANNOTATION_CLASS=REG_FEATURE,SYNONYMOUS_CODON,ACTIVE_CHROM,NC_TRANSCRIPT_VARIANT,NC_TRANSCRIPT_VARIANT;A_A_CHANGE=.,L,.,.,.;A_A_LENGTH=.,750,.,.,.;A_A_POS=.,615,.,.,.;CELL=GM12878,.,GM12878,.,.;CHROM_STATE=.,.,11,.,.;EXON_NUMBER=.,16/19,.,.,.;GENE_ID=.,ENSG00000188976,.,ENSG00000188976,ENSG00000188976;GENE_NAME=.,NOC2L,.,NOC2L,NOC2L;HGVS=.,c.1843N>T,.,n.3290N>T,n.699N>T;REG_ANNOTATION=H3K36me3,.,.,.,.;TR_BIOTYPE=.,PROTEIN_CODING,.,PROCESSED_TRANSCRIPT,PROCESSED_TRANSCRIPT;TR_ID=.,ENST00000327044,.,ENST00000477976,ENST00000483767;TR_LENGTH=.,2790,.,4201,1611;TR_POS=.,1893,.,3290,699;TR_STRAND=.,-1,.,-1,-1'

# Columns to report, in output order.
FIELDS = ('GENE_ID', 'GENE_NAME', 'EXON_NUMBER', 'SEVERE_IMPACT')


def extract_fields(row, fields=FIELDS, missing='NA'):
    """Return the row label and the requested values of *row*, tab-separated.

    *row* has the form ``"<label> KEY=VALUE;KEY=VALUE;..."``.  A value may be
    a comma-separated list in which ``.`` marks an empty slot; the first
    non-empty slot is reported.  A key that is absent, or whose slots are
    all empty, is reported as *missing*.  GENE_ID is read from the GENE_ID
    key itself, not from SEVERE_GENE.
    """
    # Cut the label from the data at the first run of whitespace; a row
    # without any data part yields a label followed only by *missing*.
    parts = row.strip().split(None, 1)
    label = parts[0] if parts else ''
    data = parts[1] if len(parts) > 1 else ''

    values = {}
    for entry in data.split(';'):
        key, separator, value = entry.partition('=')
        if separator:
            values[key] = value

    columns = [label]
    for field in fields:
        slots = [slot for slot in values.get(field, '').split(',')
                 if slot not in ('', '.')]
        columns.append(slots[0] if slots else missing)
    # join puts exactly one tab between columns, including after "NA".
    return '\t'.join(columns)


final_output = extract_fields(user_in)
print(final_output)