I have two text files. The first file contains English sentences and the second file contains a list of English words (a vocabulary). I want to remove from the sentences in the first file any word that is not present in the vocabulary, and then save the processed text back into the first file.
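To illustrate with made-up data (not the actual files): if a sentence reads "the quick brown fox" and the vocabulary contains only "the" and "fox", the cleaned sentence should become "the fox".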
I wrote code that finds the sentences containing words that are not in the second file (the vocabulary).
Here is my code:
s = open('eng.txt').readlines()
for i in s:
    print(i)
    for word in i.split(' '):
        print(word)
        if word in open("vocab30000.txt").read():
            print("Word exist in vocab")
        else:
            #print("I:", i)
            print("Word does not exist")
            #search_in_file_func(i)
            print("I:", i)
            file1 = open("MyFile.txt","a+")
            if i in file1:
                print("Sentence already exist")
            else:
                file1.write(i)
However, I am not able to remove those words.
This should work:
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())

with open('eng.txt', 'r+') as f:
    data = [line.strip().split(' ') for line in f.readlines()]
    # keep only the words that appear in the vocabulary
    removed = [[word for word in line if word in vocabulary] for line in data]
    result = '\n'.join(' '.join(word for word in line) for line in removed)
    f.seek(0)
    f.write(result)
    f.truncate()
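Building the vocabulary as a set matters here: membership tests on a set are O(1), so each sentence is filtered in time proportional to its own length, instead of rescanning the whole 30,000-word file for every word the way the loop in the question does.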
# Read the two files
with open('vocab30000.txt') as f:
    vocabulary = set(w.strip() for w in f)
with open('eng.txt') as f:
    eng = [line.strip().split(" ") for line in f]

cleaned_sentences = []
# loop over the sentences and drop the words that are not in the vocabulary
for sent in eng:
    cleaned_sentences.append(" ".join(w for w in sent if w in vocabulary) + "\n")

# write the result back to the sentence file
with open('eng.txt', 'w') as f:
    f.writelines(cleaned_sentences)
You can try this code. I tried not to use any explicit loops, to keep the runtime down if you have larger files.
import re

with open('eng.txt', 'r') as f:
    s = f.read()
s_copy = s

# strip punctuation so the word list is clean
punctuation = [".", "\"", ",", "-", "(", ")", "[", "]"]
pattern = re.compile("(" + "|".join(re.escape(p) for p in punctuation) + ")\\W", re.I)
s_copy = pattern.sub(" ", s_copy)
s_copy = s_copy.replace("\"", "")
s_words = s_copy.split(" ")

with open('vocab30000.txt', 'r') as f:
    check_words = f.read().split()    # split into words, not characters

# words that occur in the text but not in the vocabulary
remove_words = [w for w in (set(s_words) - set(check_words)) if w]   # drop empty tokens
pattern = re.compile("\\b(" + "|".join(re.escape(w) for w in remove_words) + ")\\W", re.I)
result = pattern.sub("", s)

with open('eng.txt', 'w') as f:
    f.write(result)
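Note that re.sub returns a new string (Python strings are immutable), so the result has to be assigned and written back explicitly, and re.escape keeps tokens such as ( or . from being interpreted as regex metacharacters.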
I am trying to use this script to filter words from a lexicon dictionary for my corpus. I want to train my ASR system, and I am using the following script (http://www.eleanorchodroff.com/tutorial/kaldi/training-acoustic-models.html#create-files-for-datatrain).
I get the following error when I run the code:
File "/home/metanet/ProgramFiles/kaldi/kaldi/egs/ASR-for-urdu/s5/data/filterdict.py",
line 20, in <module>
pron = columns[1]
IndexError: list index out of range
The original code was not working as-is, so I altered it. Here is the code:
# filter_dict.py
#
#
# Created by Eleanor Chodroff on 2/22/15.
# This script filters out words which are not in our corpus.
# It requires a list of the words in the corpus: words.txt

import os

ref = dict()
phones = dict()

with open("lexicon.bak") as f:
    for line in f:
        line = line.strip()
        columns = line.split(" ", 1)
        word = columns[0]
        pron = columns[1]
        try:
            ref[word].append(pron)
        except:
            ref[word] = list()
            ref[word].append(pron)
print(ref)

lex = open("lexicon.txt", "wb")
lex.write("<oov> <oov>\n")
with open("words.txt") as f:
    for line in f:
        line = line.strip()
        if line in ref.keys():
            for pron in ref[line]:
                lex.write(line + " " + pron + "\n")
        else:
            print("Word not in lexicon:" + line)
I have code that counts every word in a file and how many times each one occurs.
filename = "test.txt"
output = []
with open(filename) as f:
    content = f.readlines()
content = [x.strip() for x in content]
wordlist = {}
for line in content:
    for entry in line.split():
        word = entry.replace('.', '')
        word = word.replace(',', '')
        word = word.replace('!', '')
        word = word.replace('?', '')
        if word not in wordlist:
            wordlist[word] = 1
        else:
            wordlist[word] = wordlist[word] + 1
print(wordlist)
However, when I print this, I can't get the output ordered from the highest to the lowest number of occurrences.
Here is a test file.
hello my friend. hello sir.
How do I print it so that it looks like
hello: 2 (newline)
my: 1
etc?
from pathlib import Path
from collections import Counter
import string

filepath = Path('test.txt')
# print(filepath.exists())
with open(filepath) as f:
    content = f.readlines()

word_list = sum((
    (s.strip('\n').translate(str.maketrans('', '', string.punctuation))).split(' ')
    for s in content
), [])

# most_common() yields (word, count) pairs sorted from highest to lowest count
for key, value in Counter(word_list).most_common():
    print(f'{key} : {value}')
In Python 3.7 and up, dicts preserve insertion order, so we can just sort the dictionary items by value and insert them into a new dict.
Use:
print(dict(sorted(wordlist.items(), key=lambda x: -x[1])))
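With the test file above, both answers order the counts from highest to lowest: hello: 2 comes first, then my, friend, and sir with 1 each (ties keep their insertion order, because Python's sort is stable).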
I'm trying to process data (remove hashtags, links, and @ mentions) from CSV files and store it back to CSV. However, the output comes out wrong: every single character is separated by a comma. Can anyone help me write it to CSV in a better way? Thank you.
import re, string
import csv

def strip_links(text):
    link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')
    return text

def strip_all_entities(text):
    entity_prefixes = ['@', '#']
    for separator in string.punctuation:
        if separator not in entity_prefixes:
            text = text.replace(separator, ' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)
f = open('Test.csv')
csf_f = csv.reader(f)
temp = []
for row in csf_f:
    temp.append(row[0])

temp1 = []
for t in temp:
    temp1.append(strip_all_entities(strip_links(t)))

for i in range(0, len(temp1)):
    with open('MYOUTPUT.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(temp1)
f.close()
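The per-character commas come from writer.writerows(temp1): writerows expects a sequence of rows, and since each element of temp1 is a plain string, the string itself is iterated and every character becomes its own column. The loop also reopens MYOUTPUT.csv in 'w' mode on each iteration, truncating whatever was written before. A sketch of a fixed write-out, keeping the rest of the script unchanged:

# open the output once, and wrap each cleaned string in a one-element
# list so csv.writer treats it as a single-column row
with open('MYOUTPUT.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for t in temp1:
        writer.writerow([t])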
I am attempting to loop through a series of text files in a directory, looking for occurrences of certain types of words and prefixing each found word with a user-defined tag. My code is as follows.
import glob
import os

ACC_Tagged_Test = 'C:/ACC_Tag_Test'
for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()
        data = data.lower()

modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                      "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                "generally":1, "usually":1, "typically":1, "regularly":1, "widely":1}
plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                 "cogitate":1, "contemplate":1, "deliberate":1}
format_modal = "<555>{} ".format
format_attribute = "<666>{} ".format
format_app_adaptor = "<777>{} ".format
format_plaus_shield = "<888>{} ".format
data = " ".join(format_modal(word) if word in modals else word for word in data.split())
data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())
with open(filename, "w") as f:
    f.write(str(data))
print(data)  # This is just added in order to check on screen that all
             # files are being processed.
My problem is that although the code works on the last file in the directory, it is not working on the previous files (only 1 out of 10 is processed). I've tried a second for loop above the file write-out statements, but that is not working at all. Can anyone explain what I'm doing wrong here?
regards
My speculation is that your code is only showing the last file because it's not indented properly to have all the relevant code within the for loop. Try this indentation:
import glob
import os

ACC_Tagged_Test = 'C:/ACC_Tag_Test'
for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()
    data = data.lower()
    modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
    personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                          "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
    approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                    "generally":1, "usually":1, "typically":1, "regularly":1, "widely":1}
    plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                     "cogitate":1, "contemplate":1, "deliberate":1}
    format_modal = "<555>{} ".format
    format_attribute = "<666>{} ".format
    format_app_adaptor = "<777>{} ".format
    format_plaus_shield = "<888>{} ".format
    data = " ".join(format_modal(word) if word in modals else word for word in data.split())
    data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
    data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
    data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())
    with open(filename, "w") as f:
        f.write(str(data))
    print(data)  # This is just added in order to check on screen that all
                 # files are being processed.
Assuming all of your code is supposed to be in your for loop: you are overwriting your text file, so it looks like only your last run is working:
# this overwrites the file
with open(filename, "w") as fh:
    fh.write(str(data))
change to:
# this appends to the file
with open(filename, "a") as fh:
    fh.write(str(data))
This will append to your text file and will not overwrite previously added data with the data from the last loop.
I want to tokenize and sort reviews by keywords, but there is a problem opening the JSON file, and the program throws an error: JSONDecodeError: Extra data: line 1 column 884 (char 883).
The files test2.json and keywords.txt are here:
https://github.com/SilverYar/TransportDataMiner
Here is my code:
import string
import json
import nltk
from nltk.stem.snowball import RussianStemmer
from nltk.corpus import stopwords

st = RussianStemmer()

def tokenize_me(file_text):
    # applying nltk tokenization
    tokens = nltk.word_tokenize(file_text)
    # deleting punctuation symbols
    tokens = [i for i in tokens if i not in string.punctuation]
    # deleting stop words
    stop_words = stopwords.words('russian')
    stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
    tokens = [i for i in tokens if i not in stop_words]
    # cleaning words
    tokens = [i.replace('«', '').replace('»', '') for i in tokens]
    return tokens
with open('C:\\Creme\\token\\keywords.txt') as fin:
    ww = fin.read().split(', ')
key_words = list(set([st.stem(w) for w in ww]))

with open('C:\\Creme\\token\\test2.json') as fin:
    text = json.load(fin)

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    for dd in text:
        #for d in dd:
        words = tokenize_me(dd['description'])
        split_text = list(set([st.stem(word) for word in words]))
        #break
        tt = list(filter(lambda w: w in key_words, split_text))
        if tt:
            json.dump(dd, fout)
            fout.write('\n')
Help me find the error and fix the code.
You can't do this, because it produces a malformed JSON file:
for dd in text:
    if tt:
        json.dump(dd, fout)  # <<-- cannot do this in the loop
        fout.write('\n')
Basically it should be written all at once, with a single dump() or dumps() call.
OK, you have to build the whole list first, then write it out to the file:
bad_words_list = []
for dd in text:
    words = tokenize_me(dd['description'])
    split_text = list(set([st.stem(word) for word in words]))
    tt = list(filter(lambda w: w in key_words, split_text))
    if tt:
        bad_words_list.append(dd)

# write once, in 'w' mode, so the file holds exactly one JSON document
with open('C:\\Creme\\token\\bad.json', 'w', encoding='utf8') as fout:
    json.dump(bad_words_list, fout)
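Since everything is dumped in one call, bad.json now holds a single valid JSON array that json.load can read back directly; using 'w' mode rather than 'a' also keeps a second run from appending another array and making the file malformed again.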
I solved it a little differently: I read in the file and reformatted the string into valid JSON:
with open('C:\\Creme\\token\\test2.json', 'r', encoding='utf8') as fin:
    data = fin.read()
formatted_text = data.replace('}{', '},{')
text = json.loads(f'[{formatted_text}]')

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    for dd in text:
        #for d in dd:
        words = tokenize_me(dd['description'])
        split_text = list(set([st.stem(word) for word in words]))
        #break
        tt = list(filter(lambda w: w in key_words, split_text))
        if tt:
            json.dump(dd, fout)
            fout.write('\n')
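The '}{' to '},{' replacement works for this data, but it would also corrupt any string value that happens to contain '}{'. A more robust option for files of back-to-back JSON objects (a sketch, assuming the file really is concatenated objects) is json.JSONDecoder.raw_decode, which parses one object at a time and reports where it stopped:

import json

def load_concatenated_json(path):
    # parse a file of back-to-back JSON objects into a list
    with open(path, encoding='utf8') as f:
        data = f.read().strip()
    decoder = json.JSONDecoder()
    objects, idx = [], 0
    while idx < len(data):
        obj, end = decoder.raw_decode(data, idx)
        objects.append(obj)
        # skip any whitespace between objects
        while end < len(data) and data[end].isspace():
            end += 1
        idx = end
    return objects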