Transform a code tokens list into valid string code

Transform a code tokens list into valid string code - python

I have written code to transform Python code into a list to compute BLEU score:
import re
def tokenize_for_bleu_eval(code):
code = re.sub(r'([^A-Za-z0-9_])', r' \1 ', code)
code = re.sub(r'([a-z])([A-Z])', r'\1 \2', code)
code = re.sub(r'\s+', ' ', code)
code = code.replace('"', '`')
code = code.replace('\'', '`')
tokens = [t for t in code.split(' ') if t]
return tokens
Thanks to this snippet my code struct.unpack('h', pS[0:2]) is parsed properly into the list ['struct', '.', 'unpack', '(', 'h', ',', 'p', 'S', '[', '0', ':', '2', ']', ')'].
Initially, I thought I need simply to use the ' '.join(list_of_tokens) but it kills my variable names like this struct . unpack ('h' , p S [ 0 : 2 ] ) and my code is not executable.
I tried to use Regex to stick some variable names but I can't succeed to reverse my function tokenize_for_bleu_eval to find executable code at the end. Is someone get an idea, perhaps without regex which seems to be too complicated here?
EDIT: We can't just remove all spaces between element of the list because there are examples like items = [item for item in container if item.attribute == value] where the result of the backtranslation without space would be itemforiteminaifitem[0]==1 which is not valid.

I am trying to merge the tokens using this script
import re
def tokenize_for_bleu_eval(code):
code = re.sub(r'([^A-Za-z0-9_])', r' \1 ', code)
code = re.sub(r'([a-z])([A-Z])', r'\1 \2', code)
code = re.sub(r'\s+', ' ', code)
code = code.replace('"', '`')
code = code.replace('\'', '`')
tokens = [t for t in code.split(' ') if t]
return tokens
def merge_tokens(tokens):
code = ''.join(tokens)
code = code.replace('`', "'")
code = code.replace(',', ", ")
return code
tokenize = tokenize_for_bleu_eval("struct.unpack('h', pS[0:2])")
print(tokenize) # ['struct', '.', 'unpack', '(', '`', 'h', '`', ',', 'p', 'S', '[', '0', ':', '2', ']', ')']
merge_result = merge_tokens(tokenize)
print(merge_result) # struct.unpack('h', pS[0:2])
Edit:
I found this interesting idea to tokenize and merge.
import re
def tokenize_for_bleu_eval(code):
tokens_list = []
codes = code.split(' ')
for i in range(len(codes)):
code = codes[i]
code = re.sub(r'([^A-Za-z0-9_])', r' \1 ', code)
code = re.sub(r'([a-z])([A-Z])', r'\1 \2', code)
code = re.sub(r'\s+', ' ', code)
code = code.replace('"', '`')
code = code.replace('\'', '`')
tokens = [t for t in code.split(' ') if t]
tokens_list.append(tokens)
if i != len(codes) -1:
tokens_list.append([' '])
flatten_list = []
for tokens in tokens_list:
for token in tokens:
flatten_list.append(token)
return flatten_list
def merge_tokens(flatten_list):
code = ''.join(flatten_list)
code = code.replace('`', "'")
return code
test1 ="struct.unpack('h', pS[0:2])"
test2 = "items = [item for item in container if item.attribute == value]"
tokenize = tokenize_for_bleu_eval(test1)
print(tokenize) # ['struct', '.', 'unpack', '(', '`', 'h', '`', ',', ' ', 'p', 'S', '[', '0', ':', '2', ']', ')']
merge_result = merge_tokens(tokenize)
print(merge_result) # struct.unpack('h', pS[0:2])
tokenize = tokenize_for_bleu_eval(test2)
print(tokenize) # ['items', ' ', '=', ' ', '[', 'item', ' ', 'for', ' ', 'item', ' ', 'in', ' ', 'container', ' ', 'if', ' ', 'item', '.', 'attribute', ' ', '=', '=', ' ', 'value', ']']
merge_result = merge_tokens(tokenize)
print(merge_result) # items = [item for item in container if item.attribute == value]
This script will also remember each space from the input

Related

restart the class with initial argument value python

I am trying to do a class to manipulate a nested list ,
a_list = [ a, b, c, d, e, f, g] will be an initial argument , then I will have some other methods to manipulate the list. However, I will need to keep a copy of the original list, that by anytime I can restart it to the initial stage by calling the restart() method ,
i tried a couple of times but it didn't work, can anyone give me a hand?
class Sample:
def __init__(self, a_list, count):
self.__a_list = a_list
self.__count = 0
self.__reset_list = a_list.copy()
def restart(self):
self.__a_list = self.__reset_list
return
This is a full test case, it did not get back to the original value:
class Sample:
def __init__(self, a_list, count=0):
self.__a_list = a_list
self.__count = 0
self.__reset_list = a_list.copy()
def set_update(self):
self.__a_list[2][2] = "S"
def restart(self):
self.__a_list = self.__reset_list
return
def __str__(self):
result_list = self.__a_list
result = ''
for i in range(len(result_list)):
for j in range(len(result_list[i])):
result += result_list[i][j] + ' '
if i < len(result_list) -1:
result += '\n'
return result
test = [
['*', '*', '*', '*', '*', '*', '*', '*'],
['*', ' ', ' ', ' ', ' ', ' ', ' ', ' '],
['*', ' ', ' ', '#', 'P', ' ', ' ', '*'],
['*', '*', '*', '*', '*', ' ', '#', '*'],
['*', 'o', ' ', ' ', ' ', ' ', ' ', '*'],
['*', ' ', ' ', ' ', ' ', ' ', 'o', '*'],
['*', '*', '*', ' ', '*', '*', '*', '*']
]
a_case = Sample(test)
print(a_case)
a_case.set_update()
print(a_case)
a_case.restart()
print(a_case)

You need to copy the list on restart:
def restart(self):
self.__a_list = self.__reset_list.copy()
Otherwise __a_list points to the same list as __reset_list, and the latter will reflect any modifications made to __a_list

One thing i noticed was, this issue is not happening in 1D array. as a solution you can use deepcopy. check the answer below.
import copy
class Sample:
def __init__(self, a_list, count=0):
self.__a_list = a_list
self.__count = 0
self.__reset_list = copy.deepcopy(a_list) #use deepcopy to copy the array

Finally, I found a solution by using the nested list to nested tuple conversion
using tuples
class Sample:
def __init__(self, a_list, count=0):
self.__a_list = a_list
self.__count = 0
self.__reset_list = a_list.copy()
self.__tracks = []
def set_update(self,i,j):
current_copy = tuple(tuple(i) for i in self.__a_list)
self.__tracks.append(current_copy)
self.__a_list[i][j] = "S"
def restart(self):
initial_copy = [list(i) for i in self.__tracks[0]]
self.__a_list = initial_copy
while len(self.__tracks) > 0:
self.__tracks.pop()
return self.__a_list
def undo(self):
self.__tracks.pop()
last_copy = [list(i) for i in self.__tracks[-1]]
print(last_copy)
self.__a_list = last_copy
self.__tracks.pop()
return self.__a_list
def __str__(self):
result_list = self.__a_list
result = ''
for i in range(len(result_list)):
for j in range(len(result_list[i])):
result += result_list[i][j] + ' '
if i < len(result_list) -1:
result += '\n'
return result
test = [
['*', '*', '*', '*', '*', '*', '*', '*'],
['*', ' ', ' ', ' ', ' ', ' ', ' ', ' '],
['*', ' ', ' ', '#', 'P', ' ', ' ', '*'],
['*', '*', '*', '*', '*', ' ', '#', '*'],
['*', 'o', ' ', ' ', ' ', ' ', ' ', '*'],
['*', ' ', ' ', ' ', ' ', ' ', 'o', '*'],
['*', '*', '*', ' ', '*', '*', '*', '*']
]
a_case = Sample(test)
print(a_case)
a_case.set_update(2,2)
print(a_case)
a_case.set_update(2,3)
print(a_case)
a_case.set_update(3,3)
print(a_case)
a_case.undo()
print(a_case)
a_case.restart()
print(a_case)

how to remove the empty quotes from the list using python?

I have a python script that pre-processes the text before I can make the text analysis. Some of the functions to clean the text are:
to remove strings that are less than two characters.
to tokenize the text
The problem is that the first function returns a list and the second takes the returned list and also returns a list so it becomes list inside list. Like this:
['[', "'الموضوع", "'", ',', "'إجتماع", "'", ',', "'بين", "'", ',',
"'الجنة", "'", ',', "'البحرية", "'", ',', "'الفرعية", "'", ',',]']
where the result must be like this :
['الموضوع', 'إجتماع', 'بين', 'الجنة', 'البحرية', 'الفرعية',]
the returned result of the remove stop word :
['ا', 'ل', 'م', 'و', 'ض', 'و', 'ع', ' ', 'إ', 'ج', 'ت', 'م', 'ا', 'ع', ' ', 'ب', 'ي', 'ن', ' ', 'ا', 'ل', 'ج', 'ن', 'ة', ' ', 'ا', 'ل', 'ب', 'ح', 'ر', 'ي', 'ة', ' ', 'ا', 'ل', 'ف', 'ر', 'ع', 'ي', 'ة', ' ', 'و', 'ن', 'ظ', 'ي', 'ر', 'ت', 'ه', 'ا', ' ', 'ف', 'ي', ' ', 'م', 'ب', 'س', 'و', 'ط', ' ', 'ا', 'ل', 'م', 'س', 'ت', 'ن', 'د', ' ', 'ب', 'ر', 'ق', 'ي', 'ة', ' ', 'ر', 'ق', 'م', ' ', '1', '7', '1', 'ع', ' ', 'ت', 'ا', 'ر', 'ي', 'خ', ' ', '1', '2', '1', ]
where the result must be:
['تاريخ', '1212019','الموضوع', 'إجتماع', 'بين', 'الجنة', 'البحرية', 'الفرعية',]
code
def remove_1char(text):
tokens = text.split()
tokens = [word for word in tokens if len(word) > 1]
result = str(tokens)
write_file("remove_1char.txt",result)
return result
def tokenize_text(text):
tokens=word_tokenize(text)
write_file("tokenize_text.txt",tokens)
return tokens
def remove_stopwords(tokens):
write_file("tokens_before_remove.txt",tokens)
stop_word_list = set(stopwords.words('arabic'))
clean_tokens= [tok for tok in tokens if tok not in stop_word_list]
write_file("remove_stop_word.txt",clean_tokens)
return clean_tokens
def clean_text(text):
rmws = remove_whiteSpace(text)
rmp = remove_punctuations(rmws)
rmd = remove_diacritics(rmp)
rmrc = remove_repeating_char(rmd)
rm1c = remove_1char(rmrc)
clean_tokens = remove_stopwords(rm1c)
write_file("result.txt",clean_tokens)
return clean_tokens
So how to fix this problem?

Let's open a Python REPL and go through your code.
I assume the first line specifies the input string, to assign it to a variable.
>>> l = ['الموضوع', 'إجتماع', 'بين', 'الجنة', 'البحرية', 'الفرعية',]
>>> l
['الموضوع', 'إجتماع', 'بين', 'الجنة', 'البحرية', 'الفرعية']
You didn't specify which function is called, but I assume you first call the function remove_1char for each input string. We will call the lines of the function one after another an view the results for the first item of the list, which we will call text.
>>> text = l[0]
>>> tokens = text.split()
>>> tokens
>>> ['الموضوع']
Since every word in the input sequence consists of one word, the output is as expected.
>>> tokens = [word for word in tokens if len(word) > 1]
>>> tokens
['الموضوع']
And all words have more then 1 character. Also as expected.
>>> result = str(tokens)
>>> result
"['الموضوع']"
>>>
In this line the string representation of the list is assigned to result. That's probably not what you want. I think you want to connect the tokens to a single string. This can be done with the join function.
>>> result = ' '.join(tokens)
>>> result
'الموضوع'
>>>

Issue using regex to split strings with a list of delimiters

I am using this function to split a text in words and separators while preserving them
import re
def split_text_in_words(phrase_text, separators=[" "]):
separator_regex = """({0})""".format("""|""".join(separators))
return [f for f in re.split(separator_regex,phrase_text) if len(f) > 0]
I am using this code like this:
>>> split_text_in_words('Mary & his family has a?nice.house at #157, at the beach? Of course! it is great. I owe her 40$ so I plan to pay my debt weekly at 3% interest :) "no comment"', separators=[' ', '\?', '\*', '\.', ',', ';', ':', "'", '"', '-', '\?', '!', '#', '\$', '%', '^', '&'])
['Mary', ' ', '&', ' ', 'his', ' ', 'family', ' ', 'has', ' ', 'a', '?', 'nice', '.', 'house', ' ', 'at', ' ', '#', '157', ',', ' ', 'at', ' ', 'the', ' ', 'beach', '?', ' ', 'Of', ' ', 'course', '!', ' ', 'it', ' ', 'is', ' ', 'great', '.', ' ', 'I', ' ', 'owe', ' ', 'her', ' ', '40', '$', ' ', 'so', ' ', 'I', ' ', 'plan', ' ', 'to', ' ', 'pay', ' ', 'my', ' ', 'debt', ' ', 'weekly', ' ', 'at', ' ', '3', '%', ' ', 'interest', ' ', ':', ')', ' ', '"', 'no', ' ', 'comment', '"']
This looks good so far and is precisely what I want. However when adding parens on the list of separators and I happen to have the text starting with a parens, the splitting gears don't kick in:
>>> split_text_in_words('(as if it was not aware) Mary & his family has a?nice beach* house at #157, at the beach? Of course! it is great. I owe her 40$ so I plan to pay my debt weekly at 3% interest :) "no comment"', separators=[' ', '\?', '\*', '\.', ',', ';', ':', "'", '"', '-', '\?', '!', '#', '\$', '%', '^', '&', '\*', '\(', '\)'])
['(as', ' ', 'if', ' ', 'it', ' ', 'was', ' ', 'not', ' ', 'aware', ')', ' ', 'Mary', ' ', '&', ' ', 'his', ' ', 'family', ' ', 'has', ' ', 'a', '?', 'nice', ' ', 'beach', '*', ' ', 'house', ' ', 'at', ' ', '#', '157', ',', ' ', 'at', ' ', 'the', ' ', 'beach', '?', ' ', 'Of', ' ', 'course', '!', ' ', 'it', ' ', 'is', ' ', 'great', '.', ' ', 'I', ' ', 'owe', ' ', 'her', ' ', '40', '$', ' ', 'so', ' ', 'I', ' ', 'plan', ' ', 'to', ' ', 'pay', ' ', 'my', ' ', 'debt', ' ', 'weekly', ' ', 'at', ' ', '3', '%', ' ', 'interest', ' ', ':', ')', ' ', '"', 'no', ' ', 'comment', '"']
The first parens remains attached to the word. I can work around this issue by simply appending a space at beginning:
>>> split_text_in_words(' (as if it was not aware) Mary & his family has a?nice beach* house at #157, at the beach? Of course! it is great. I owe her 40$ so I plan to pay my debt weekly at 3% interest :) "no comment"', separators=[' ', '\?', '\*', '\.', ',', ';', ':', "'", '"', '-', '\?', '!', '#', '\$', '%', '^', '&', '\*', '\(', '\)'])
[' ', '(', 'as', ' ', 'if', ' ', 'it', ' ', 'was', ' ', 'not', ' ', 'aware', ')', ' ', 'Mary', ' ', '&', ' ', 'his', ' ', 'family', ' ', 'has', ' ', 'a', '?', 'nice', ' ', 'beach', '*', ' ', 'house', ' ', 'at', ' ', '#', '157', ',', ' ', 'at', ' ', 'the', ' ', 'beach', '?', ' ', 'Of', ' ', 'course', '!', ' ', 'it', ' ', 'is', ' ', 'great', '.', ' ', 'I', ' ', 'owe', ' ', 'her', ' ', '40', '$', ' ', 'so', ' ', 'I', ' ', 'plan', ' ', 'to', ' ', 'pay', ' ', 'my', ' ', 'debt', ' ', 'weekly', ' ', 'at', ' ', '3', '%', ' ', 'interest', ' ', ':', ')', ' ', '"', 'no', ' ', 'comment', '"']
But I'm concerned why this happening and if the strategy (hack, really) of adding a space at the beginning does not reassure me that it won't fail in some other more subtle case
Why is this happening, and would the hack/fix of appending a space at the beginning work in general?

The problem is the unescaped ^. You should probably escape all punctuation characters you use with something like:
split_text_in_words(
'(as if it was not aware) Mary & his family',
separators=["\\" + c for c in " ?*.,;:'\"-!#$%^&()"]
)
Maybe, even do it in the function:
import re
def split_text_in_words(phrase_text, separators=[" "]):
inter = "|".join(
re.sub(r"(^|[^\\])([^A-Za-z0-9])", r"\\\2", sep) for sep in separators
)
# Add the backslash if not already present for every non-alphanumeric
# character.
separator_regex = "({0})".format(inter)
return [f for f in re.split(separator_regex, phrase_text) if len(f) > 0]

Problem is use of unescaped ^ in your separator that becomes part of your splitting regex. ^ is a special regex meta-character that means start anchor.
You must escape it as this:
separators=[' ', '\?', '\*', '\.', ',', ';', ':', "'", '"', '-', '\?', '!', '#', '\$', '%', '\^', '&', '\*', '\(', '\)']

^ marks the beginning of the string so it must be escaped in the separator list: '\^'
A more comfortable and safer way would be to not escape the separators in the parameter but in the function instead:
separator_regex = """({0})""".format("""|""".join(map(re.escape, separators)))

Remove stopwords from sentences

I'd want to remove stopwords from a sentence.
I've this piece of code:
splitted = text.split()
for index, word in enumerate(splitted):
if word in self.stopWords:
del splitted[index]
text = " ".join(splitted)
stopWords is updated with this instruction self.stopWords.update(['.', ',', "\"", "\'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '),', '],', '},', '",', "',", '")', '"]', '"}', "-", "--", '\".', "\'.", '/', ').', '-', '--', '%', '°\'', '(-', '("', '."', '.),', ');', '–', '$', 'a']) but, for example, the letter 'a', such as '.', or ';' isn't deleted from the sentence.
What would I do?

I think it's easier to use list comprehension (or generator expression as I do here):
' '.join(w for w in text.split() if w not in stop_words)

Can you try my code? If you have any question about code,just ask to me.
def splitFile(lines,splitvalue):
documents={};
documentCount=1
dcmnt="";
for line in lines:
dcmnt+=line;
if (line.__contains__(splitvalue)):
key="documents"+(str)(documentCount);
documents[key]=dcmnt;
dcmnt="";
documentCount=documentCount+1;
return documents;
documentswords = []
with open('reuter10.txt','r') as f: #reading a text file and splitting it into single words
for line in f:
for word in line.split():
documentswords.append(word)
stopwords=[]
with open('stopwords.txt','r') as f: #reading a text file and splitting it into single words
for line in f:
for word in line.split():
stopwords.append(word)
readFile=open("reuter10.txt","r");
lines=readFile.readlines();
readFile.close();
alldocuments=splitFile(lines, "</reuters>");
temp=[]
for i in range(0,documentswords.__len__()):
count = 0;
for ii in range(0, stopwords.__len__()):
if documentswords[i]==stopwords[ii]:
count=count+1
if ii+1==stopwords.__len__() and count==0:
temp.append(documentswords[i])
print("")
print("*****PRINTING WORDS WITHOUT STOPWORDS*****")
print("")
for i in range(0, temp.__len__()): #printing words without stopwords
print(temp[i]);

PYTHON How to count letters in words without special characters

I have a code that counts letters in words excluding special characters at the end. I just can't figure out a way to get it to exclude special character at the beginning also.
My code so far:
inFile = open( 'p.txt', "r" ).readlines()
myResults = []
for i in range( 20 ):
myResults.append( 0 )
mySpecialList = [ '-', '+', '#', '#', '!', '(', ')', '?', '.', ',', ':', ';', '"', "'", '`' ]
for line in inFile:
words = str.split( line )
for word in words:
if word not in mySpecialList:
if word[ -1 ] not in mySpecialList :
myResults[ len( word ) ] += 1
else :
myResults[ len( word ) - 1 ] += 1
print( myResults )

Here is some simple code to count all the alpha numeric letters of a single word.
word = "Hello World!"
count = 0
for c in word:
if c.isalnum():
count +1
print( count )
If you wanted to use your special characters you could adapt the code to look like
mySpecialList = ['*', '!']
word = "Hello World!"
count = 0
for c in word:
if c not in mySpecialList:
count +1
print( count )

You can use regular expressions, try it!
For example you can split string and after findall you have a list with all words.
import re
string = "Hello World, Hi + Say"
print(re.findall(r"[\w']+", string))

def reverseword(user_input):
words=str(user_input).split(" ")
newWords = [word[::-1] for word in words]
newSentence = " ".join(newWords)
return newSentence
if __name__ == "__main__":
while True:
ispresent=0
splcharlist=['-', '+', '#', '#', '!', '(', ')', '?', '.', ',', ':', ';', '"', "'", '`'," "]
user_input=input("Enter the input:")
print(len(user_input))
ccount=0
new_input=""
ch_count=0
if len(user_input)>100:
for eletter in user_input:
if eletter not in splcharlist:
ccount=ccount+1
ch_count=ch_count+1
if ccount>100:
break
new_input=user_input[:100]
else:
new_input=user_input
print("This is for your input:",user_input)
print("input with limit :"+str(new_input))
print(len(new_input))
print("The Reverse lists is: ",reverseword(new_input))
if "stop" in user_input:
break

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Transform a code tokens list into valid string code - python

Related

restart the class with initial argument value python

how to remove the empty quotes from the list using python?

Issue using regex to split strings with a list of delimiters

Remove stopwords from sentences

PYTHON How to count letters in words without special characters

Categories

Resources