I have some code that gives me a list of words together with the frequencies with which they occur in the text. I'd like to make the code automatically convert the top 10 words into an ARFF file with
#RELATION wordfrequencies
#ATTRIBUTE word string
#ATTRIBUTE frequency numeric
and the top 10 words as data rows with their frequencies.
I'm struggling with how to do this with my current code:
import re
import nltk
# Quran subset
filename = 'subsetQuran.txt'
# create list of lower case words
word_list = re.split('\s+', file(filename).read().lower())
print 'Words in text:', len(word_list)
word_list2 = [w.strip() for w in word_list if w.strip() not in nltk.corpus.stopwords.words('english')]
# create dictionary of word:frequency pairs
freq_dic = {}
# punctuation and numbers to be removed
punctuation = re.compile(r'[-.?!,":;()|0-9]')
for word in word_list2:
# remove punctuation marks
word = punctuation.sub("", word)
# form dictionary
try:
freq_dic[word] += 1
except:
freq_dic[word] = 1
print '-'*30
print "sorted by highest frequency first:"
# create list of (val, key) tuple pairs
freq_list2 = [(val, key) for key, val in freq_dic.items()]
# sort by val or frequency
freq_list2.sort(reverse=True)
freq_list3 = list(freq_list2)
# display result
for freq, word in freq_list2:
print word, freq
f = open("wordfreq.txt", "w")
f.write( str(freq_list3) )
f.close()
Any help with this is appreciated; trying to work out a way of doing this is really racking my brain!
I hope you don't mind the slight rewrite:
import re
import nltk
from collections import defaultdict
# Quran subset
filename = 'subsetQuran.txt'
# create list of lower case words
word_list = open(filename).read().lower().split()
print 'Words in text:', len(word_list)
# remove stopwords
word_list = [w for w in word_list if w not in nltk.corpus.stopwords.words('english')]
# create dictionary of word:frequency pairs
freq_dic = defaultdict(int)
# punctuation and numbers to be removed
punctuation = re.compile(r'[-.?!,":;()|0-9]')
for word in word_list:
# remove punctuation marks
word = punctuation.sub("", word)
# increment count for word
freq_dic[word] += 1
print '-' * 30
print "sorted by highest frequency first:"
# create list of (frequency, word) tuple pairs
freq_list = [(freq, word) for word, freq in freq_dic.items()]
# sort by descending frequency
freq_list.sort(reverse=True)
# display result
for freq, word in freq_list:
print word, freq
# write ARFF file for 10 most common words
f = open("wordfreq.txt", "w")
f.write("#RELATION wordfrequencies\n")
f.write("#ATTRIBUTE word string\n")
f.write("#ATTRIBUTE frequency numeric\n")
f.write("#DATA\n")
for freq, word in freq_list[:10]:
f.write("'%s',%d\n" % (word, freq))
f.close()
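For what it's worth, the counting and the top-10 selection can also be done with collections.Counter (Python 2.7+). A minimal sketch reusing word_list and the punctuation regex from above:
from collections import Counter
# strip punctuation from each word, then count everything in one go
counts = Counter(punctuation.sub("", w) for w in word_list)
f = open("wordfreq.txt", "w")
f.write("#RELATION wordfrequencies\n")
f.write("#ATTRIBUTE word string\n")
f.write("#ATTRIBUTE frequency numeric\n")
f.write("#DATA\n")
for word, freq in counts.most_common(10):
    f.write("'%s',%d\n" % (word, freq))
f.close()
Also note that standard ARFF files, as read by Weka, use @RELATION, @ATTRIBUTE and @DATA rather than a # prefix, so if the file is meant to be loaded into Weka you will probably want @ there.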
This code finds the frequency of the words in a text file, and I would like to know how to change it so it finds the longest words in the file and prints them out, e.g. "The longest word has 13 characters. The result includes: "
REDUCER CODE
import sys
results = {}
for line in sys.stdin:
word, frequency = line.strip().split('\t', 1)
    results[word] = results.get(word, 0) + int(frequency)
words = list(results.keys())
words.sort()
for word in words:
print(word,results[word])
MAPPER CODE
import sys
for line in sys.stdin:
for word in line.strip().split():
        print(word, "1", sep="\t")  # tab-separated so the reducer's split('\t') works
If you don't want to keep all words then you could do something like this:
longest = set()
max_length = 0
for line in sys.stdin:
for word in line.strip().split():
length = len(word)
if length > max_length:
max_length = length
longest = {word}
elif length == max_length:
longest.add(word)
print(longest)
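If you want the output worded like the example sentence in the question, you could format the result roughly like this (longest and max_length come from the snippet above):
print("The longest word has", max_length, "characters. The result includes:", ", ".join(sorted(longest)))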
If you want to keep them, grouped by length, you could use a defaultdict:
from collections import defaultdict
words_length = defaultdict(set)
for line in sys.stdin:
for word in line.strip().split():
words_length[len(word)].add(word)
print(words_length[max(words_length)])
To build on my suggestion (loop through words, keep longest in variable):
longest = ""
for line in something:
for word in line.lower().split():
if len(word.strip()) > len(longest):
longest = word.strip()
print("Longest word is:", longest, "with the length of:", len(longest))
I was given a .txt file with a text. I have already cleaned the text (removed punctuation, uppercase letters and symbols), and now I have a string with the words.
I am now trying to get the character count (len()) of each word in the string, and then make a plot where the number of characters N is on the X-axis and the Y-axis is the number of words that have that length.
So far I have:
text = "sample.txt"
def count_chars(txt):
result = 0
for char in txt:
result += 1 # same as result = result + 1
return result
print(count_chars(text))
So far this is computing the total len() of the string instead of the length of each word.
I would like something like Counter() from collections, which returns each word together with a count of how many times it is repeated throughout the text.
from collections import Counter
word_count=Counter(text)
I want to get the number of characters per word; once I have that count, the plotting should be easier.
Thanks, anything helps!
Okay, first of all you need to open the sample.txt file.
with open('sample.txt', 'r') as text_file:
text = text_file.read()
or
text = open('sample.txt', 'r').read()
Now we can count the characters of each word in the text and put the results, for example, in a dict.
counter_dict = {}
for word in text.split(" "):
counter_dict[word] = len(word)
print(counter_dict)
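That dict maps each word to its length, but for the plot you need the opposite direction: how many words there are for each length. A minimal sketch using collections.Counter (still assuming text holds the contents of sample.txt):
from collections import Counter
# keys are word lengths (N characters), values are how many words have that length
length_counts = Counter(len(word) for word in text.split())
print(length_counts)
length_counts can then be fed straight into a bar chart, as the answer below demonstrates.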
It looks like the accepted answer doesn't fully address the plotting part of the question:
Then make a plot where the number of characters N is on the X-axis and the Y-axis is the number of words that have that length.
import matplotlib.pyplot as plt
# ch10 = ... the text of "Moby Dick"'s chapter 10, as found
# in https://www.gutenberg.org/files/2701/2701-h/2701-h.htm
# split ch10 into a list of words
words = [w for w in ch10.split() if w]
# some words are joined by an em-dash
words = sum((w.split('—') for w in words), [])
# remove suffixes and one prefix
for suffix in (',','.',':',';','!','?','"'):
words = [w.removesuffix(suffix) for w in words]
words = [w.removeprefix('"') for w in words]
# count the different lengths using a dict
d = {}
for w in words:
    length = len(w)
    d[length] = d.get(length, 0) + 1
# retrieve the relevant info from the dict
lengths, counts = zip(*d.items())
# plot the relevant info
plt.bar(lengths, counts)
plt.xticks(range(1, max(lengths)+1))
plt.xlabel('Word lengths')
plt.ylabel('Word counts')
# what is the longest word?
plt.title(' '.join(w for w in words if len(w)==max(lengths)))
# T H E E N D
plt.show()
I have to write a function that counts how many times a word (or a series of words) appears in a given text.
This is my function so far. What I noticed is that with a series of 3 words the function works well, but not with 4 words and so on.
from nltk import ngrams
def function(text, word):
for char in ".?!-":
text = text.replace(char, ' ')
n = len(word.split())
countN = 0
bigram_lower = text.lower()
word_lower = word.lower()
n_grams = ngrams(bigram_lower.split(), n)
for gram in n_grams:
for i in range (0, n):
if gram[i] == word_lower.split()[i]:
countN = countN + 1
print (countN)
First, please fix your indentation and avoid bigram-style names (such as bigram_lower) for n-gram data, since you are not storing just bigrams there, which makes the code a bit confusing. Secondly, let's look at this part of your code:
for gram in n_grams:
for i in range (0, n):
if gram[i] == word_lower.split()[i]:
countN = countN + 1
print (countN)
Here you are increasing countN by one each time a single word in your n-gram matches up, instead of increasing it when the whole n-gram matches up. You should instead only increase countN if all the words have matched up:
for gram in n_grams:
if list(gram) == word_lower.split():
countN = countN + 1
print (countN)
Maybe this was already answered here.
Is nltk mandatory?
# Open the file in read mode
text = open("sample.txt", "r")
# Create an empty dictionary
d = dict()
# Loop through each line of the file
for line in text:
# Remove the leading spaces and newline character
line = line.strip()
# Convert the characters in line to
# lowercase to avoid case mismatch
line = line.lower()
# Split the line into words
words = line.split(" ")
# Iterate over each word in line
for word in words:
# Check if the word is already in dictionary
if word in d:
# Increment count of word by 1
d[word] = d[word] + 1
else:
# Add the word to dictionary with count 1
d[word] = 1
# Print the contents of dictionary
for key in list(d.keys()):
print(key, ":", d[key])
This should work for you:
import nltk
def function(text, word):
for char in ".?!-,":
text = text.replace(char, ' ')
n = len(word.split())
countN = 0
bigram_lower = text.lower()
word_lower = tuple(word.lower().split())
bigrams = nltk.ngrams(bigram_lower.split(), n)
for gram in bigrams:
if gram == word_lower:
countN += 1
print (countN)
>>> tekst="this is the text i want to search, i want to search it for the words i want to search for, and it should count the occurances of the words i want to search for"
>>> function(tekst, "i want to search")
4
>>> function(tekst, "i want to search for")
2
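A slightly shorter variant of the same idea, sketched with collections.Counter: count every n-gram once and then look the phrase up (count_phrase is just an illustrative name; it assumes the same punctuation handling as above):
import nltk
from collections import Counter
def count_phrase(text, phrase):
    # replace punctuation with spaces, as in the original function
    for char in ".?!-,":
        text = text.replace(char, ' ')
    target = tuple(phrase.lower().split())
    counts = Counter(nltk.ngrams(text.lower().split(), len(target)))
    return counts[target]
print(count_phrase(tekst, "i want to search"))   # 4, as in the example above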
My program opens a file and counts the words contained in it, but I want to create a dictionary consisting of all the unique words in the text.
For example, if the word 'computer' appears three times, I want that to count as one unique word.
def main():
file = input('Enter the name of the input file: ')
infile = open(file, 'r')
file_contents = infile.read()
infile.close()
words = file_contents.split()
number_of_words = len(words)
print("There are", number_of_words, "words contained in this paragarph")
main()
Use a set. This will only include unique words:
words = set(words)
If you don't care about case, you can do this:
words = set(word.lower() for word in words)
This assumes there is no punctuation. If there is, you will need to strip the punctuation.
import string
words = set(word.lower().strip(string.punctuation) for word in words)
If you need to keep track of how many of each word you have, just replace set with Counter in the examples above:
import string
from collections import Counter
words = Counter(word.lower().strip(string.punctuation) for word in words)
This will give you a dictionary-like object that tells you how many of each word there is.
You can also get the number of unique words from this (although it is slower if that is all you care about):
import string
from collections import Counter
words = Counter(word.lower().strip(string.punctuation) for word in words)
nword = len(words)
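To make this concrete, a tiny made-up example (the sentence is purely illustrative; the printed order assumes Python 3.7+, where ties keep insertion order):
import string
from collections import Counter
sample = "The cat saw the dog."
words = Counter(word.lower().strip(string.punctuation) for word in sample.split())
print(words)                  # Counter({'the': 2, 'cat': 1, 'saw': 1, 'dog': 1})
print(len(words))             # 4 unique words
print(words.most_common(1))   # [('the', 2)]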
@TheBlackCat's solution works, but it only tells you how many unique words are in the string/file. This solution also shows you how many times each word occurs.
dictionaryName = {}
for word in words:
    if word not in dictionaryName:
        dictionaryName[word] = 1
    else:
        dictionaryName[word] = dictionaryName.get(word) + 1
print(dictionaryName)
tested with:
words = "Foo", "Bar", "Baz", "Baz"
output: {'Foo': 1, 'Bar': 1, 'Baz': 2}
Probably a cleaner and quicker solution:
words_dict = {}
for word in words:
word_count = words_dict.get(word, 0)
words_dict[word] = word_count + 1
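For example, with a made-up input list the loop produces a plain dict of counts:
words = ["foo", "bar", "foo"]   # made-up input
words_dict = {}
for word in words:
    word_count = words_dict.get(word, 0)
    words_dict[word] = word_count + 1
print(words_dict)   # {'foo': 2, 'bar': 1}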
I'm working with some of the corpus materials from NLPP. I'm trying to improve my unscrambling score in the code... at the moment I'm hitting 91.250%.
The point of the exercise is to alter the represent_word function to improve the score.
The function consumes a word as a string; this word is either scrambled or unscrambled. The function produces a "representation" of the word, which is a list containing the following information:
word length
number of vowels
number of consonants
first and last letter of the word (these are always unscrambled)
a tuple of the most commonly used words from the corpus whose characters are also members of the given input word.
I have also tried analysing anagrams of prefixes and suffixes, but they don't add anything to the score beyond what the tuple of most common words with shared characters already provides.
I'm not sure why I can't improve the score. I've even tried increasing the dictionary size by importing words from another corpus.
The only section that can be altered here is the represent_word function and the definitions just above it. However, I'm including the entire source in case it yields some insight to someone.
import nltk
import re
def word_counts(corpus, wordcounts = {}):
""" Function that counts all the words in the corpus."""
for word in corpus:
wordcounts.setdefault(word.lower(), 0)
wordcounts[word.lower()] += 1
return wordcounts
JA_list = filter(lambda x: x.isalpha(), map(lambda x:x.lower(),
nltk.corpus.gutenberg.words('austen-persuasion.txt')))
JA_freqdist=nltk.FreqDist(JA_list)
JA_toplist=sorted(JA_freqdist.items(),key=lambda x: x[1], reverse=True)[:0]
JA_topwords=[]
for i in JA_toplist:
JA_topwords.append(i[0])
PP_list = filter(lambda x: x.isalpha(),map(lambda x:x.lower(),
open("Pride and Prejudice.txt").read().split()))
PP_freqdist=nltk.FreqDist(PP_list)
PP_toplist=sorted(PP_freqdist.items(),key=lambda x: x[1], reverse=True)[:7]
PP_topwords=[]
for i in PP_toplist:
PP_topwords.append(i[0])
uniquewords=[]
for i in JA_topwords:
if i not in PP_topwords:
uniquewords.append(i)
else:
continue
uniquewords.extend(PP_topwords)
def represent_word(word):
def common_word(word):
dictionary= uniquewords
findings=[]
for string in dictionary:
if all((letter in word) for letter in string):
findings.append(string)
else:
False
if not findings:
return None
else:
return tuple(findings)
vowels = list("aeiouy")
    consonants = list("bcdfghjklmnpqrstvwxz")
number_of_consonants = sum(word.count(i) for i in consonants)
number_of_vowels = sum(word.count(i) for i in vowels)
split_word=list(word)
common_words=common_word(word)
return tuple([split_word[0],split_word[-1], len(split_word),number_of_consonants, number_of_vowels, common_words])
def create_mapping(words, mapping = {}):
""" Returns a mapping of representations of words to the most common word for that representation. """
for word in words:
representation = represent_word(word)
mapping.setdefault(representation, ("", 0))
if mapping[representation][1] < words[word]:
mapping[representation] = (word, words[word])
return mapping
if __name__ == '__main__':
    # Create a mapping of representations of the words in Persuasion by Jane Austen to use as a corpus
words = JA_freqdist
mapping = create_mapping(words)
# Load the words in the scrambled file
with open("Pdrie and Puicejdre.txt") as scrambled_file:
scrambled_lines = [line.split() for line in scrambled_file if len(line.strip()) > 0 ]
scrambled_words = [word.lower() for line in scrambled_lines for word in line]
# Descramble the words using the best mapping
descrambled_words = []
for scrambled_word in scrambled_words:
representation = represent_word(scrambled_word)
if representation in mapping:
descrambled_word = mapping[representation][0]
else:
descrambled_word = scrambled_word
descrambled_words.append(descrambled_word)
# Load the original words
with open("Pride and Prejudice.txt") as original_file:
original_lines = [line.split() for line in original_file if len(line.strip()) > 0 ]
original_words = [word.lower() for line in original_lines for word in line]
# Make a list of word pairs from descrambled_words and original words
word_pairs = zip(descrambled_words, original_words)
# See if the words are the same
judgements = [descrambled_word == original_word for (descrambled_word, original_word) in word_pairs]
# Print the results
print "Correct: {0:.3%}".format(float(judgements.count(True))/len(judgements))