I have a Python script that takes in '.html' files, removes stop words, and returns all remaining words in a Python dictionary. If the same word occurs in multiple files, I want it returned only once, i.e. the result should contain the non-stop words, each exactly once.
import os
import re

def run():
    filelist = os.listdir(path)
    regex = re.compile(r'.*<div class="body">(.*?)</div>.*', re.DOTALL | re.IGNORECASE)
    reg1 = re.compile(r'<\/?[ap][^>]*>', re.DOTALL | re.IGNORECASE)
    quotereg = re.compile(r'"', re.DOTALL | re.IGNORECASE)
    puncreg = re.compile(r'[^\w]', re.DOTALL | re.IGNORECASE)
    f = open(stopwordfile, 'r')
    stopwords = f.read().lower().split()
    filewords = {}
    htmlfiles = []
    for file in filelist:
        if file[-5:] == '.html':
            htmlfiles.append(file)
    totalfreq = {}
    for file in htmlfiles:
        f = open(path + file, 'r')
        words = f.read().lower()
        words = regex.findall(words)[0]
        words = quotereg.sub(' ', words)
        words = reg1.sub(' ', words)
        words = puncreg.sub(' ', words)
        words = words.strip().split()
        for w in stopwords:
            while w in words:
                words.remove(w)
        freq = {}
        for w in words:
            words = words
        print words

if __name__ == '__main__':
    run()
Use a set. Simply add every word you find to the set; it ignores duplicates.
Assuming you have an iterator that returns each word in a file (this is for plain text; HTML would be rather more complicated):
def words(filename):
    with open(filename) as wordfile:
        for line in wordfile:
            for word in line.split():
                yield word
Then getting them into a set is simple:
wordlist = set(words("words.txt"))
If you have multiple files, just do it like so:
wordlist = set()
wordfiles = ["words1.txt", "words2.txt", "words3.txt"]
for wordfile in wordfiles:
    wordlist |= set(words(wordfile))
You can also use a set for your stop words. Then you can simply subtract them from the word list after the fact, which will probably be faster than checking to see if each word is a stop word before adding.
stopwords = set(["a", "an", "the"])
wordlist -= stopwords
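Not part of the original answer, but a rough sketch of how these pieces could fit together for the multi-file HTML case in the question. collect_words is a made-up name; path and stopwordfile are assumed to be the same values as in the question, and the regexes are copied from it:
import os
import re

def collect_words(path, stopwordfile):
    # same patterns as in the question
    body_re = re.compile(r'.*<div class="body">(.*?)</div>.*', re.DOTALL | re.IGNORECASE)
    tag_re = re.compile(r'<\/?[ap][^>]*>', re.IGNORECASE)
    punc_re = re.compile(r'[^\w]')

    with open(stopwordfile) as f:
        stopwords = set(f.read().lower().split())

    unique_words = set()
    for name in os.listdir(path):
        if not name.endswith('.html'):
            continue
        with open(os.path.join(path, name)) as f:
            text = f.read().lower()
        body = body_re.findall(text)[0]
        body = tag_re.sub(' ', body)
        body = punc_re.sub(' ', body)
        # adding to a set collapses duplicates across all files automatically
        unique_words |= set(body.split())

    # drop the stop words in one subtraction
    return unique_words - stopwords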
I have code that counts every word in a file and how many times each word occurred.
filename = "test.txt"
output = []
with open(filename) as f:
    content = f.readlines()
content = [x.strip() for x in content]

wordlist = {}
for line in content:
    for entry in line.split():
        word = entry.replace('.', '')
        word = word.replace(',', '')
        word = word.replace('!', '')
        word = word.replace('?', '')
        if word not in wordlist:
            wordlist[word] = 1
        else:
            wordlist[word] = wordlist[word] + 1

print(wordlist)
However, when I print this, I cannot get the output ordered from the highest to the lowest number of occurrences.
Here is a test file.
hello my friend. hello sir.
How do I print it so that it looks like
hello: 2 (newline)
my: 1
etc?
from pathlib import Path
from collections import Counter
import string

filepath = Path('test.txt')
# print(filepath.exists())

with open(filepath) as f:
    content = f.readlines()

word_list = sum((
    (s.strip('\n').translate(str.maketrans('', '', string.punctuation))).split(' ')
    for s in content
), [])

# most_common() returns (word, count) pairs sorted from the highest to the lowest count
for key, value in Counter(word_list).most_common():
    print(f'{key} : {value}')
In Python 3.7 and up, dicts preserve insertion order, so we can just sort the dictionary items by value and then insert them into (or create) a new dict.
Use:
print(dict(sorted(wordlist.items(), key = lambda x: -x[1])))
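For the sample file above ("hello my friend. hello sir."), a quick sketch of the expected output using the same sorting key; the wordlist literal just spells out the counts the earlier loop would produce:
wordlist = {'hello': 2, 'my': 1, 'friend': 1, 'sir': 1}  # counts from the sample file

for word, count in sorted(wordlist.items(), key=lambda x: -x[1]):
    print(f'{word}: {count}')

# hello: 2
# my: 1
# friend: 1
# sir: 1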
I am using the HDBSCAN algorithm to create clusters from the documents I have. To build a vector matrix from the words I am currently using tf-idf, but I want to use GloVe or Word2vec instead (tf-idf is based on bag-of-words, so it can't capture semantics).
Which method can I use - GloVe, Word2vec, or any other method appropriate for text clustering?
And how can I implement it?
Any help will be highly appreciated!
import csv
import re
import string
import sys

import nltk
import pandas as pd
import hdbscan
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')

title = []
synopses = []
filename = "twitter-test-dataset.csv"
num_clusters = 10
pkl_file = "doc_cluster.pkl"
generate_pkl = False

# pre-process data
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting field names through first row
    fields = next(csvreader)

    # extracting each data row one by one
    duplicates = 0
    for row in csvreader:
        # removes the characters specified
        line = re.sub(r'[.,"!]+', '', row[2], flags=re.MULTILINE)
        line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # removes RT
        line = re.sub(r'https?:\/\/.*[\r\n]*', '',
                      line, flags=re.MULTILINE)  # remove link
        line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
        line = re.sub(
            r"(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", line, flags=re.MULTILINE)
        line = ''.join(filter(lambda x: x in string.printable,
                              line))  # filter non-ascii characters
        if line not in synopses:
            synopses.append(line)
            title.append(row[2])
        else:
            duplicates += 1

print("Removed " + str(duplicates) + " rows")
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

totalvocab_stemmed = []
totalvocab_tokenized = []
for i in synopses:
    # for each item in 'synopses', tokenize/stem
    allwords_stemmed = tokenize_and_stem(i)
    # extend the 'totalvocab_stemmed' list
    totalvocab_stemmed.extend(allwords_stemmed)

    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

vocab_frame = pd.DataFrame(
    {'words': totalvocab_tokenized}, index=totalvocab_stemmed)
# print "there are " + str(vocab_frame.shape[0]) + " items in vocab_frame"

# define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.0, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   ngram_range=(1, 3))

# CREATE TFIDF MATRIX
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
terms = tfidf_vectorizer.get_feature_names()

c = hdbscan.HDBSCAN(min_cluster_size=5)

# PASS TFIDF_MATRIX TO HDBSCAN
c.fit(tfidf_matrix)
print(c.labels_)
sys.exit()
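Not part of the question, but a minimal sketch of the embedding route it asks about, assuming gensim (4.x API) is installed: average the Word2vec vectors of each tokenized synopsis and cluster the resulting dense matrix with HDBSCAN. doc_vector is a made-up helper; tokenize_only and synopses come from the code above.
import numpy as np
from gensim.models import Word2Vec
import hdbscan

# one token list per document, reusing tokenize_only() from above
tokenized = [tokenize_only(s) for s in synopses]

# train a small Word2vec model on the corpus itself (pretrained GloVe/Word2vec vectors could be loaded instead)
w2v = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=1, workers=4)

def doc_vector(tokens, model):
    # average the vectors of the tokens the model knows about
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

doc_vectors = np.vstack([doc_vector(t, w2v) for t in tokenized])

clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
clusterer.fit(doc_vectors)
print(clusterer.labels_)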
I have two text files. The 1st file contains English sentences and the 2nd file contains a number of English words (a vocabulary). I want to remove from the sentences in the 1st file those words which are not present in the vocabulary, and then save the processed text back into the 1st file.
I wrote code with which I can find the sentences that contain words not present in the 2nd file (the vocabulary).
Here is my code:
s = open('eng.txt').readlines()
for i in s:
    print(i)
    for word in i.split(' '):
        print(word)
        if word in open("vocab30000.txt").read():
            print("Word exist in vocab")
        else:
            #print("I:", i)
            print("Word does not exist")
            #search_in_file_func(i)
            print("I:", i)
            file1 = open("MyFile.txt", "a+")
            if i in file1:
                print("Sentence already exist")
            else:
                file1.write(i)
However, I am not able to remove those words.
This should work:
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())

with open('eng.txt', 'r+') as f:
    data = [line.strip().split(' ') for line in f.readlines()]
    removed = [[word for word in line if word in vocabulary] for line in data]
    result = '\n'.join(' '.join(word for word in line) for line in removed)
    f.seek(0)
    f.write(result)
    f.truncate()
# Read the two files
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())

with open('eng.txt') as f:
    eng = f.readlines()

sentences = [line.split(" ") for line in eng]
cleaned_sentences = []
# loop over the sentences and keep only the words that appear in the vocabulary
for sent in sentences:
    cleaned_sentences.append(" ".join([word for word in sent if word.strip() in vocabulary]))

# write the cleaned sentences back to the 1st file
with open('eng.txt', 'w') as f:
    f.write("\n".join(cleaned_sentences))
You can try this code. I tried not to use any loops, to save runtime in case you have larger files.
import re

with open('eng.txt', 'r') as f:
    s = f.read()
s_copy = s

# strip punctuation so the words can be compared cleanly
punctuation = [".", "\"", ",", "-", "(", ")", "[", "]"]
pattern = re.compile("(" + "|".join(map(re.escape, punctuation)) + ")", re.I)
s_copy = pattern.sub(" ", s_copy)
s_words = s_copy.split()

with open('vocab30000.txt', 'r') as f:
    check_words = set(f.read().split())

# words present in the text but not in the vocabulary
remove_words = [w for w in set(s_words) - check_words if w]
pattern = re.compile(r"\b(" + "|".join(map(re.escape, remove_words)) + r")\b", re.I)
s = pattern.sub("", s)
Trying to write a function that recursively counts the number of times a word appears in a text file in Python.
def word_count(filename, word):
    with open('C:/Users/Ibrahim/Desktop/file.txt', 'r') as f:
        result_list = [x.split(',') for x in f.readlines()]
        for i in result_list:
            if i == word:
                return word_count(filename, word)
is what I currently have.
I think it may be helpful for you:
import sys, re

def word_count(filename, niddle, splitter=","):
    regex_pattern = '|'.join(map(re.escape, splitter))
    with open(filename, 'r') as f:
        words = [word for line in f.read().splitlines() for word in re.split(regex_pattern, line)]
        words = filter(None, words)
        print "Total Words :", len(words)
        print "Searching %s in list" % niddle
        print "Total Occurance : %d" % words.count(niddle)

def main(argv):
    splitter = ","
    if len(argv) == 3:
        filename, word, splitter = argv
    elif len(argv) == 2:
        filename, word = argv
        splitter = splitter
    else:
        print "Usage : word_count.py <file> <word> <splitter>"
        sys.exit()
    word_count(filename, word, splitter)

if __name__ == "__main__":
    main(sys.argv[1:])
Counter is your friend here, i.e.:
from collections import Counter
f = open('yourfile.txt', 'r')
counts = Counter(f.read().split())
print counts
To check if a specific word exists:
if "myword" in counts:
    print "exists"
To get a specific word count value, use:
print counts.get('myword')
# or simply
print counts['myword']
Note:
A Counter is simply a dict subclass and supports all the dict operators and methods.
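A small sketch (not from the answer above) of that dict-like behaviour, plus the Counter-specific helpers:
from collections import Counter

counts = Counter("the quick brown fox jumps over the lazy dog the end".split())

print(counts['the'])           # 3 -- indexing works like a dict, and missing keys return 0
print('fox' in counts)         # True -- membership test, as used above
counts.update(['fox', 'fox'])  # dict-style update that adds to the counts
print(counts.most_common(2))   # the two highest counts, e.g. [('the', 3), ('fox', 3)]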
I have written the following code to count the number of sentences, words and characters in the input file sample.txt, which contains a paragraph of text. It works fine for the number of sentences and words, but it does not give the precise and correct number of characters (without whitespace and punctuation marks).
lines, blanklines, sentences, words = 0, 0, 0, 0
num_chars = 0

print '-' * 50

try:
    filename = 'sample.txt'
    textf = open(filename, 'r')
except IOError:
    print 'cannot open file %s for reading' % filename
    import sys
    sys.exit(0)

for line in textf:
    print line
    lines += 1
    if line.startswith('\n'):
        blanklines += 1
    else:
        sentences += line.count('.') + line.count('!') + line.count('?')
        tempwords = line.split(None)
        print tempwords
        words += len(tempwords)
textf.close()

print '-' * 50
print "Lines:", lines
print "blank lines:", blanklines
print "sentences:", sentences
print "words:", words

import nltk
import nltk.data
import nltk.tokenize

with open('sample.txt', 'r') as f:
    for line in f:
        num_chars += len(line)

num_chars = num_chars - (words + 1)

pcount = 0
from nltk.tokenize import TreebankWordTokenizer
with open('sample.txt', 'r') as f1:
    for line in f1:
        #tokenised_words = nltk.tokenize.word_tokenize(line)
        tokenizer = TreebankWordTokenizer()
        tokenised_words = tokenizer.tokenize(line)
        for w in tokenised_words:
            if ((w == '.') | (w == ';') | (w == '!') | (w == '?')):
                pcount = pcount + 1

print "pcount:", pcount
num_chars = num_chars - pcount
print "chars:", num_chars
pcount is the number of punctuation marks. Can someone suggest the changes I need to make in order to find the exact number of characters without spaces and punctuation marks?
import string

#
# Per-line counting functions
#
def countLines(ln): return 1
def countBlankLines(ln): return 0 if ln.strip() else 1
def countWords(ln): return len(ln.split())

def charCounter(validChars):
    vc = set(validChars)
    def counter(ln):
        return sum(1 for ch in ln if ch in vc)
    return counter

countSentences = charCounter('.!?')
countLetters = charCounter(string.ascii_letters)
countPunct = charCounter(string.punctuation)
#
# do counting
#
class FileStats(object):
    def __init__(self, countFns, labels=None):
        super(FileStats, self).__init__()
        self.fns = countFns
        self.labels = labels if labels else [fn.__name__ for fn in countFns]
        self.reset()

    def reset(self):
        self.counts = [0] * len(self.fns)

    def doFile(self, fname):
        try:
            with open(fname) as inf:
                for line in inf:
                    for i, fn in enumerate(self.fns):
                        self.counts[i] += fn(line)
        except IOError:
            print('Could not open file {0} for reading'.format(fname))

    def __str__(self):
        return '\n'.join('{0:20} {1:>6}'.format(label, count) for label, count in zip(self.labels, self.counts))

fs = FileStats(
    (countLines, countBlankLines, countSentences, countWords, countLetters, countPunct),
    ("Lines", "Blank Lines", "Sentences", "Words", "Letters", "Punctuation")
)
fs.doFile('sample.txt')
print(fs)
results in
Lines                   101
Blank Lines              12
Sentences                48
Words                   339
Letters                1604
Punctuation             455
You can also use a regex to replace all non-alphanumeric characters and then count the number of characters in each line.
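A sketch of that regex approach (not from the answer above), assuming the same sample.txt: strip everything that is not a letter or digit with re.sub, then count what is left.
import re

num_chars = 0
with open('sample.txt') as f:
    for line in f:
        # remove everything except letters and digits, then count the remainder
        num_chars += len(re.sub(r'[^A-Za-z0-9]', '', line))

print(num_chars)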
One thing you could do is, when you read the line, iterate through it and increment the number of characters:
for character in line:
    if character.isalnum():
        num_chars += 1
P.S. You might want to change the if-statement condition to satisfy your particular needs, e.g. if you want to count '$' as well.
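For example, a widened condition that also counts dollar signs might look like this (the example line is made up):
num_chars = 0
line = "Total cost: $45.00 (approx.)"   # made-up example line

for character in line:
    # count letters/digits plus any extra symbols of interest, here '$'
    if character.isalnum() or character == '$':
        num_chars += 1

print(num_chars)  # 19 letters/digits plus the dollar sign = 20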
Try this to count the number of words and the number of sentences, and to get the probability of a given word occurring:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

text_file = open("..//..//static//output.txt", "r")
lines = text_file.readlines()

x = 0
tokenized_words = [word_tokenize(i) for i in lines]
for i in tokenized_words:
    print(i)              # list containing the tokens
    print(str(len(i)))    # word count
    for j in i:
        if j == 'words':  # simple count of occurrences of the word 'words'
            x = x + 1

tokenized_sents = [sent_tokenize(k) for k in lines]
for k in tokenized_sents:
    print("Sentences" + str(k))                    # list containing the sentences
    print("number of sentences " + str(len(k)))    # number of sentences

print("number of word" + str(x))
print("Probability of 'word' in text file " + str(x / len(i)))