I am attempting to loop through a series of text files in a directory, looking for occurrences of certain types of words and prefixing each found word with a user-defined tag. My code is as follows.
import glob
import os

ACC_Tagged_Test = 'C:/ACC_Tag_Test'

for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()

data = data.lower()
modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                      "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                "generally":1, "usually":1, "typically":1, "regularly":1, "widely":1}
plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                 "cogitate":1, "contemplate":1, "deliberate":1}
format_modal = "<555>{} ".format
format_attribute = "<666>{} ".format
format_app_adaptor = "<777>{} ".format
format_plaus_shield = "<888>{} ".format
data = " ".join(format_modal(word) if word in modals else word for word in data.split())
data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())
with open(filename, "w") as f:
    f.write(str(data))
print(data)  # This is just added in order to check on screen that all
             # files are being processed.
My problem is that although the code works on the last file in the directory, it is not working on the previous files (only 1 out of 10 in this case). I've tried a second for loop above the file write-out statements, but that is not working at all. Can anyone explain what I'm doing wrong here?
regards
My speculation is that your code is only processing the last file because it's not indented properly to keep all the relevant code within the for loop.
Try it with this indentation:
import glob
import os

ACC_Tagged_Test = 'C:/ACC_Tag_Test'

for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()
    data = data.lower()
    modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
    personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                          "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
    approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                    "generally":1, "usually":1, "typically":1, "regularly":1, "widely":1}
    plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                     "cogitate":1, "contemplate":1, "deliberate":1}
    format_modal = "<555>{} ".format
    format_attribute = "<666>{} ".format
    format_app_adaptor = "<777>{} ".format
    format_plaus_shield = "<888>{} ".format
    data = " ".join(format_modal(word) if word in modals else word for word in data.split())
    data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
    data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
    data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())
    with open(filename, "w") as f:
        f.write(str(data))
    print(data)  # This is just added in order to check on screen that all
                 # files are being processed.
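As an aside, since the dict values are never used, the four passes over data could be collapsed into a single pass with one dict mapping each word to its tag. A minimal sketch, assuming the same four word lists (modals, personal_attribute, approx_adapt, plaus_shields) defined above:

    # Build one lookup: word -> tag prefix (assumes the four dicts from the snippet above).
    tags = {}
    for words, tag in [(modals, "<555>"), (personal_attribute, "<666>"),
                       (approx_adapt, "<777>"), (plaus_shields, "<888>")]:
        for w in words:
            tags[w] = tag

    # Single pass: prefix a word with its tag if it has one.
    data = " ".join(tags.get(word, "") + word for word in data.split())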
Assuming all of your code is supposed to be in your for loop: you are overwriting your text file, therefore it looks like only your last run is working:
# this overwrites the file
with open(filename, "w") as fh:
    fh.write(str(data))
change to:
# this appends to the file
with open(filename, "a") as fh:
    fh.write(str(data))
This will append to your text file and will not overwrite previously added data with the data from the last loop.
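For reference, a minimal illustration of the difference between the two modes (demo.txt is just a placeholder name):

    with open("demo.txt", "w") as fh:  # "w" truncates: any previous contents are lost
        fh.write("first\n")
    with open("demo.txt", "w") as fh:
        fh.write("second\n")           # the file now contains only "second"

    with open("demo.txt", "a") as fh:  # "a" appends: previous contents are kept
        fh.write("third\n")            # the file now contains "second" then "third"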
Okay, you didn't understand anything from the title, so let me explain.
I have a file, and there is some text in this file, for example "jack.123 jackie.321".
I want to check whether the word jack exists in the file, and I want to print "jack.123".
That's my problem: I don't want to print all of the text.
def append(name, password):
    f = open("myfile.txt", "w")
    f.write("{},{}".format(name, password))

append("jack", ".123")
append("jackie", ".321")

f = open("myfile.txt", "r")
if "jack" in f.read():
    print("query found")
Open the file, read all its contents, then split on whitespace. That effectively gives you all the words in the file.
Iterate over the list of words, checking whether a word starts with the name you're searching for followed by '.'.
Note that there may be more than one occurrence, so build a list.
def find_name(filename, name):
    if not name[-1] == '.':
        name += '.'
    found = []
    with open(filename) as myfile:
        for word in myfile.read().split():
            if word.startswith(name):
                found.append(word)
    return found

print(*find_name('myfile.txt', 'jack'))
def new_pass(name, passwd):
    """creates the file and writes name and passwd to it"""
    with open("myfile1.txt", "a") as f:
        f.write(name + "." + passwd + "\n")

new_pass("jack", "123")
new_pass("jack", "183")
new_pass("jack", "129")
new_pass("jack", "223")

def check_word(file, word):
    """checks if a word exists and returns its first occurrence"""
    with open(file) as f:
        l = f.read().split("\n")
    for i in l:
        if i.startswith(word):
            print("query found")
            return i

print(check_word("myfile1.txt", "jack"))
In Python you can do it like this:
with open('file.txt', 'r') as file:
    data = file.read()

if "jack" in data:
    print("jack")
If I understood incorrectly, let me know.
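If you need the whole token ("jack.123") rather than just confirmation that the name is there, a small extension of the same idea is a sketch like this (splitting on whitespace, as in the earlier answer):

    with open('file.txt', 'r') as file:
        data = file.read()

    # print every whitespace-separated token that starts with "jack."
    for token in data.split():
        if token.startswith("jack."):
            print(token)  # e.g. "jack.123"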
New to Python, and I'm trying to count the words in a directory of text files and write the output to a separate text file. However, I want to specify conditions: if the word count is > 0, I would like to write the count and file path to one file, and if the count is == 0, I would like to write the count and file path to a separate file. Below is my code so far. I think I'm close, but I'm hung up on how to do the conditions and the separate files. Thanks.
import sys
import os
from collections import Counter
import glob

stdoutOrigin = sys.stdout
sys.stdout = open("log.txt", "w")

def count_words_in_dir(dirpath, words, action=None):
    for filepath in glob.iglob(os.path.join("path", '*.txt')):
        with open(filepath) as f:
            data = f.read()
            for key, val in words.items():
                #print("key is " + key + "\n")
                ct = data.count(key)
                words[key] = ct
            if action:
                action(filepath, words)

def print_summary(filepath, words):
    for key, val in sorted(words.items()):
        print(filepath)
        if val > 0:
            print('{0}:\t{1}'.format(key, val))

filepath = sys.argv[1]
keys = ["x", "y"]
words = dict.fromkeys(keys, 0)
count_words_in_dir(filepath, words, action=print_summary)

sys.stdout.close()
sys.stdout = stdoutOrigin
I would strongly urge you to not repurpose stdout for writing data to a file as part of the normal course of your program. I also wonder how you can ever have a word "count < 0". I assume you meant "count == 0".
The main problem that your code has is in this line:
for filepath in glob.iglob(os.path.join("path", '*.txt')):
The string constant "path" I'm pretty sure doesn't belong there. I think you want dirpath there instead. I would think that this problem would prevent your code from working at all.
Here's a version of your code where I fixed these issues and added the logic to write to two different output files based on the count:
import sys
import os
import glob

out1 = open("/tmp/so/seen.txt", "w")
out2 = open("/tmp/so/missing.txt", "w")

def count_words_in_dir(dirpath, words, action=None):
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        with open(filepath) as f:
            data = f.read()
            for key, val in words.items():
                # print("key is " + key + "\n")
                ct = data.count(key)
                words[key] = ct
            if action:
                action(filepath, words)

def print_summary(filepath, words):
    for key, val in sorted(words.items()):
        whichout = out1 if val > 0 else out2
        print(filepath, file=whichout)
        print('{0}: {1}'.format(key, val), file=whichout)

filepath = sys.argv[1]
keys = ["country", "friend", "turnip"]
words = dict.fromkeys(keys, 0)
count_words_in_dir(filepath, words, action=print_summary)

out1.close()
out2.close()
Result:
file seen.txt:
/Users/steve/tmp/so/dir/data2.txt
friend: 1
/Users/steve/tmp/so/dir/data.txt
country: 2
/Users/steve/tmp/so/dir/data.txt
friend: 1
file missing.txt:
/Users/steve/tmp/so/dir/data2.txt
country: 0
/Users/steve/tmp/so/dir/data2.txt
turnip: 0
/Users/steve/tmp/so/dir/data.txt
turnip: 0
(excuse me for using some search words that were a bit more interesting than yours)
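One more design note: the two module-level output files could be managed with a with statement so they are closed even if an exception occurs. A sketch of just that change, keeping the same paths:

    # out1/out2 are bound at module level, so print_summary can still see them
    with open("/tmp/so/seen.txt", "w") as out1, open("/tmp/so/missing.txt", "w") as out2:
        count_words_in_dir(filepath, words, action=print_summary)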
Hello, I hope I understood your question correctly. This code will count how many different words are in your file and, depending on the conditions, will do what you want.
import os

all_words = {}

def count(file_path):
    with open(file_path, "r") as f:
        # for better performance it is a good idea to go through the file line by line
        for line in f:
            # single out all the words by splitting the string around spaces
            words = line.split(" ")
            # and check whether each word already exists in the all_words dictionary...
            for word in words:
                key = word.replace(",", "").replace(".", "").lower()
                try:
                    # ...if it does, increment its number of repetitions
                    all_words[key] += 1
                except KeyError:
                    # ...if it doesn't, create it with a repetition count of 1
                    all_words[key] = 1

if __name__ == '__main__':
    # for every text file in the current directory, count how many words it has
    for file in os.listdir("."):
        if file.endswith(".txt"):
            all_words = {}
            count(file)
            n = len(all_words)
            # depending on the number of words, do something
            if n > 0:
                with open("count1.txt", "a") as f:
                    f.write(file + "\n" + str(n) + "\n")
            else:
                with open("count2.txt", "a") as f:
                    f.write(file + "\n" + str(n) + "\n")
If you want to count the same word multiple times, you can add up all the values in the dictionary, or you can eliminate the try-except block and count every word there.
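For what it's worth, collections.Counter from the standard library does the same bookkeeping without the try/except. A minimal sketch of the counting function (the file name is a placeholder):

    from collections import Counter

    def count_words(file_path):
        """Count the distinct words in a file, ignoring commas, periods and case."""
        words = Counter()
        with open(file_path) as f:
            for line in f:
                words.update(w.replace(",", "").replace(".", "").lower()
                             for w in line.split())
        return words

    word_counts = count_words("example.txt")  # placeholder file name
    print(len(word_counts))                   # number of distinct words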
I'm trying to process data (remove hashtags, links, and @ mentions) from CSV files and store it back to CSV. However, the output does not come out well: it is separated with a comma after each character. Can anyone help me write it to CSV in a better way? Thank you.
import re, string
import csv

def strip_links(text):
    link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')
    return text

def strip_all_entities(text):
    entity_prefixes = ['@', '#']
    for separator in string.punctuation:
        if separator not in entity_prefixes:
            text = text.replace(separator, ' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

f = open('Test.csv')
csf_f = csv.reader(f)
temp = []
for row in csf_f:
    temp.append(row[0])

temp1 = []
for t in temp:
    temp1.append(strip_all_entities(strip_links(t)))

for i in range(0, len(temp1)):
    with open('MYOUTPUT.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(temp1)

f.close()
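The per-character commas happen because csv.writer.writerows() treats each element of temp1 as a row, i.e. as an iterable of fields, and iterating a string yields its characters; the loop also reopens the file with "w" on every pass, truncating it each time. A sketch of the fix (open the output once and wrap each string in a one-element list):

    with open('MYOUTPUT.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        for t in temp1:
            writer.writerow([t])  # a one-element list -> one field per row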
I have two text files. The 1st file contains English sentences and the 2nd file contains a number of English words (a vocabulary). I want to remove from the sentences in the 1st file those words which are not present in the vocabulary, and then save the processed text back into the 1st file.
I wrote code with which I am able to find the sentences that contain words not present in the 2nd file (the vocabulary).
Here is my code:
s = open('eng.txt').readlines()
for i in s:
    print(i)
    for word in i.split(' '):
        print(word)
        if word in open("vocab30000.txt").read():
            print("Word exist in vocab")
        else:
            #print("I:", i)
            print("Word does not exist")
            #search_in_file_func(i)
            print("I:", i)
            file1 = open("MyFile.txt", "a+")
            if i in file1:
                print("Sentence already exist")
            else:
                file1.write(i)
However, I am not able to remove those words.
This should work:
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())

with open('eng.txt', 'r+') as f:
    data = [line.strip().split(' ') for line in f.readlines()]
    removed = [[word for word in line if word in vocabulary] for line in data]
    result = '\n'.join(' '.join(word for word in line) for line in removed)
    f.seek(0)
    f.write(result)
    f.truncate()
# Read the two files
with open('vocab30000.txt') as f:
    vocabulary = set(line.strip() for line in f)

with open('eng.txt') as f:
    sentences = [line.strip().split(" ") for line in f]

cleaned_sentences = []
# loop over the sentences and keep only the words that are in the vocabulary
for sent in sentences:
    cleaned_sentences.append(" ".join(word for word in sent if word in vocabulary) + "\n")

# write the cleaned sentences back to the first file
with open('eng.txt', 'w') as f:
    f.writelines(cleaned_sentences)
You can try this code. I tried not to use any loops, to save runtime if you have larger files.
import re

with open('eng.txt', 'r') as f:
    s = f.read()
s_copy = s

punctuation = [".", "\"", ",", "-", "(", ")", "[", "]"]
# escape the punctuation so characters like "(" don't break the pattern
pattern = re.compile("\\b(" + "|".join(map(re.escape, punctuation)) + ")\\W", re.I)
s_copy = pattern.sub(" ", s_copy)
s_copy = s_copy.replace("\"", "")
s_words = s_copy.split(" ")

with open('vocab30000.txt', 'r') as f:
    check_words = f.read().split()

# words that appear in the text but not in the vocabulary
remove_words = [w for w in set(s_words) - set(check_words) if w]
pattern = re.compile("\\b(" + "|".join(map(re.escape, remove_words)) + ")\\W", re.I)
s = pattern.sub("", s)

with open('eng.txt', 'w') as f:
    f.write(s)
I read from a file; whenever a "." is found, a newline "\n" should be added to the text, which is then written back to the file. I tried this code but still have the problem.
inp = open('rawCorpus.txt', 'r')
out = open("testFile.text", "w")

for line in iter(inp):
    l = line.split()
    if l.endswith("."):
        out.write("\n")
    s = '\n'.join(l)
    print(s)

out.write(str(s))
inp.close()
out.close()
Try this (normal way):
with open("rawCorpus.txt", 'r') as read_file:
    raw_data = read_file.readlines()

my_save_data = open("testFile.text", "a")

for lines in raw_data:
    if "." in lines:
        re_lines = lines.replace(".", ".\r\n")
        my_save_data.write(re_lines)
    else:
        my_save_data.write(lines + "\n")

my_save_data.close()
If your text file is not big, you can try this too:
with open("rawCorpus.txt", 'r') as read_file:
    raw_data = read_file.read()

re_data = raw_data.replace(".", ".\n")

with open("testFile.text", "w") as save_data:
    save_data.write(re_data)
UPDATE (note that how newlines display depends on your text viewer too: in some text editors "\n" is a new line, but in others it's "\r\n"):
Input sample:
This is a book. i love it.
This is a apple. i love it.
This is a laptop. i love it.
This is a pen. i love it.
This is a mobile. i love it.
Code:
last_buffer = []
read_lines = [line.rstrip('\n') for line in open('input.txt')]
my_save_data = open("output.txt", "a")

for lines in read_lines:
    re_make_lines = lines.split(".")
    for items in re_make_lines:
        if items.replace(" ", "") == "":
            pass
        else:
            result = items.strip() + ".\r\n"
            my_save_data.write(result)

my_save_data.close()
Output will be:
This is a book.
i love it.
This is a apple.
i love it.
This is a laptop.
i love it.
This is a pen.
i love it.
This is a mobile.
i love it.
You are overwriting the string s in every loop with s = '\n'.join(l).
Initialize s = '' as an empty string before the for loop and append the new lines in every iteration, e.g. with s += '\n'.join(l) (short for s = s + '\n'.join(l)).
This should work:
inp = open('rawCorpus.txt', 'r')
out = open('testFile.text', 'w')

s = ''  # empty string
for line in iter(inp):
    l = line.split('.')
    s += '\n'.join(l)  # add the new lines to s

print(s)
out.write(str(s))

inp.close()
out.close()
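One caveat with this version: line.split('.') throws away the periods themselves, so they will not appear in the output. If the periods should be kept, replacing is simpler than splitting; a sketch:

    with open('rawCorpus.txt', 'r') as inp, open('testFile.text', 'w') as out:
        for line in inp:
            out.write(line.replace('.', '.\n'))  # keep the period, newline after it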
Here is my own solution, but I still want one more newline after "."; this solution doesn't do that.
read_lines = [line.rstrip('\n') for line in open('rawCorpus.txt')]
words = []
my_save_data = open("my_saved_data.txt", "w")

for lines in read_lines:
    words.append(lines)

for word in words:
    w = word.rstrip().replace('.', '\n.')
    w = w.split()
    my_save_data.write(str("\n".join(w)))
    print("\n".join(w))

my_save_data.close()
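If the goal is a blank line (one more newline) after each ".", the replacement string just needs a second "\n". A minimal sketch of that variant:

    # Replace each "." with ".\n\n" so a blank line follows every sentence.
    with open('rawCorpus.txt') as inp, open('my_saved_data.txt', 'w') as out:
        out.write(inp.read().replace('.', '.\n\n'))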