appending specific words to list from file in python - python

I am writing a program that reads from a file of 50,000 words and it needs to get the percentage of words that do not have the letter 'e' in them. I can get the program to print all the words without e's but I want to append them to a list so that I can get the sum of the elements within the list. What I have now gives me the result of 0 every time I run it. It also produces the total amount of lines which is correct. Sorry, I am not the best in python.
f=open("hardwords.txt")
def has_no_e(f):
words = []
sum_words= len(words)
total = sum(1 for s in f)
print total
print sum_words
letter = 'e'
for line in f:
for l in letter:
if l in line:
break
else:
words.append(line)
has_no_e(f)

You don't need to collect the words, just count them.
Untested:
total = 0
without_e = 0
with open("hardwords.txt") as f:
for line in f:
total = total + 1
if not 'e' in line:
without_e = without_e + 1
percentage = float(without_e) / float(total)

What about this:
def has_no_e():
with open(path, "r") as f:
words = [word.strip() for line in f.readlines() for word in line.strip().split(',')]
words_without_e = [word for word in words if 'e' not in word]
print len(words), words
print len(words_without_e), words_without_e
has_no_e()
Now you just need to calculate the percentage

This does just so:
def has_no_e(path):
total_words = 0
words_without_e = 0
with open(path, "r") as f:
for line in f:
words = line.lower().split()
total_words += len(words)
words_without_e += sum("e" not in w for w in words)
return (float(words_without_e)/total_words)*100

This a possible way to do it:
with open('G:\Tmp\demo.txt', 'r') as f:
total = 0
count = 0
for line in f:
words = line.split()
total = total + len(words)
count = count + len([w for w in words if w.find('e') > 0])
print 'Total word:{0}, counted:{1}'.format(total, count)

Related

Programming a sum function in python?

how to sum numbers attached to words in a text file(not separate them into digits) in python? (example: "a23 B55" - answer = 78)
thats what i did but its not quite right:
def rixum(file_name):
f = open(file_name,'r')
line = f.readline()
temp = line.split()
res = []
for word in temp:
i = 0
while i < len(word)-1:
if word[i].isdigit():
res.append(int(word[i:]))
print(sum(res))
f.close()
return sum(res)
This worked for me:
import re
string = 'F43 n90 i625'
def summ_numbers(string):
return sum([int(num) for num in re.findall('\d+', string)])
print(summ_numbers(string))
Output:
758
You don't really need to build a list - you can simply accumulate the values as you go along (line by line):
def rixum(filename):
with open(filename) as data:
for line in data:
total = 0
for token in line.split():
for i, c in enumerate(token):
if c.isdigit():
total += int(token[i:])
break
print(total)

Splitting synset and removing duplicate words

Score SynsetTerms
1 soft#18 mild#3 balmy#2
1 love#2 enjoy#3
0.625 love#1
From the input above, how can i achieve the output as below? I wish to create a file that has the word and their score by removing duplicate words, and splitting each word into a new row. Duplicated words with higher score will be selected instead.
Score SynsetTerms
1 soft
1 mild
1 balmy
1 enjoy
1 love
note that the word 'love' with 0.625 score was removed, only the 'love' with score 1 is kept as it has higher score.
import re
lst = []
dict = {}
i = 1
fhand = open('C:/Users/10648879/Documents/python_prog/data/test.csv', 'r')
for line in fhand:
if i == 1:
i = i + 1
continue
line = re.sub('#[0-9]*', '', line).strip()
line = re.split('\s+', line)
for counter in range(len(line)):
if counter == 0:
score = line[counter]
continue
if line[counter] in dict:
if score > dict[line[counter]]:
dict[line[counter]] = score
else :
dict[line[counter]] = score
i = i + 1
print 'score' + ' ' + 'SynsetTerms'
for k, v in dict.items():
print v, k

Trying to output the x most common words in a text file

I'm trying to write a program that will read in a text file and output a list of most common words (30 as the code is written now) along with their counts. so something like:
word1 count1
word2 count2
word3 count3
... ...
... ...
wordn countn
in order of count1 > count2 > count3 >... >countn. This is what I have so far but I cannot get the sorted function to perform what I want. The error I get now is:
TypeError: list indices must be integers, not tuple
I'm new to python. Any help would be appreciated. Thank you.
def count_func(dictionary_list):
return dictionary_list[1]
def print_top(filename):
word_list = {}
with open(filename, 'r') as input_file:
count = 0
#best
for line in input_file:
for word in line.split():
word = word.lower()
if word not in word_list:
word_list[word] = 1
else:
word_list[word] += 1
#sorted_x = sorted(word_list.items(), key=operator.itemgetter(1))
# items = sorted(word_count.items(), key=get_count, reverse=True)
word_list = sorted(word_list.items(), key=lambda x: x[1])
for word in word_list:
if (count > 30):#19
break
print "%s: %s" % (word, word_list[word])
count += 1
# This basic command line argument parsing code is provided and
# calls the print_words() and print_top() functions which you must define.
def main():
if len(sys.argv) != 3:
print 'usage: ./wordcount.py {--count | --topcount} file'
sys.exit(1)
option = sys.argv[1]
filename = sys.argv[2]
if option == '--count':
print_words(filename)
elif option == '--topcount':
print_top(filename)
else:
print 'unknown option: ' + option
sys.exit(1)
if __name__ == '__main__':
main()
Use the collections.Counter class.
from collections import Counter
for word, count in Counter(words).most_common(30):
print(word, count)
Some unsolicited advice: Don't make so many functions until everything is working as one big block of code. Refactor into functions after it works. You don't even need a main section for a script this small.
Using itertools' groupby:
from itertools import groupby
words = sorted([w.lower() for w in open("/path/to/file").read().split()])
count = [[item[0], len(list(item[1]))] for item in groupby(words)]
count.sort(key=lambda x: x[1], reverse = True)
for item in count[:5]:
print(*item)
This will list the file's words, sort them and list unique words and their occurrence. Subsequently, the found list is sorted by occurrence by:
count.sort(key=lambda x: x[1], reverse = True)
The reverse = True is to list the most common words first.
In the line:
for item in count[:5]:
[:5] defines the number of most occurring words to show.
First method as others have suggested i.e. by using most_common(...) doesn't work according to your needs cause it returns the nth first most common words and not the words whose count is less than or equal to n:
Here's using most_common(...): note it just print the first nth most common words:
>>> import re
... from collections import Counter
... def print_top(filename, max_count):
... words = re.findall(r'\w+', open(filename).read().lower())
... for word, count in Counter(words).most_common(max_count):
... print word, count
... print_top('n.sh', 1)
force 1
The correct way would be as follows, note it prints all the words whose count is less than equal to count:
>>> import re
... from collections import Counter
... def print_top(filename, max_count):
... words = re.findall(r'\w+', open(filename).read().lower())
... for word, count in filter(lambda x: x[1]<=max_count, sorted(Counter(words).items(), key=lambda x: x[1], reverse=True)):
... print word, count
... print_top('n.sh', 1)
force 1
in 1
done 1
mysql 1
yes 1
egrep 1
for 1
1 1
print 1
bin 1
do 1
awk 1
reinstall 1
bash 1
mythtv 1
selections 1
install 1
v 1
y 1
Here is my python3 solution. I was asked this question in an interview and the interviewer was happy this solution, albeit in a less time-constrained situation the other solutions provided above seem a lot nicer to me.
dict_count = {}
lines = []
file = open("logdata.txt", "r")
for line in file:# open("logdata.txt", "r"):
lines.append(line.replace('\n', ''))
for line in lines:
if line not in dict_count:
dict_count[line] = 1
else:
num = dict_count[line]
dict_count[line] = (num + 1)
def greatest(words):
greatest = 0
string = ''
for key, val in words.items():
if val > greatest:
greatest = val
string = key
return [greatest, string]
most_common = []
def n_most_common_words(n, words):
while len(most_common) < n:
most_common.append(greatest(words))
del words[(greatest(words)[1])]
n_most_common_words(20, dict_count)
print(most_common)

Count consecutive occurrences of values in a .txt file

I have a .txt file that has two words repeating in separate lines.
Here is an example. (the actual one is about 80,000 lines long)
ANS
ANS
ANS
AUT
AUT
AUT
AUT
ANS
ANS
ANS
ANS
ANS
I am trying to develop some Python code to count the consecutive lines and return the number of times they repeat. So for this example I would like to return [3,4,5] to another .txt file
word="100011010"
count=1
length=""
for i in range(1, len(word)):
if word[i-1] == word[i]:
count += 1
else:
length += word[i-1]+" repeats "+str(count)+", "
count=1
length += ("and "+word[i]+" repeats "+str(count))
print (length)
The concept is similar to the above code for a string. Is there a way to do this with a list?
You can read the entire file as this:
content = []
with open('/path/to/file.txt', 'r') as file
content = file.readlines()
#Maybe you want to strip the lines
#content = [line.strip() for line in file.readlines()]
Here you have a list with all the lines of the file
def count_consecutive_lines(lines):
counter = 1
output = ''
for index in range(1, len(lines)):
if lines[index] != lines[index-1]:
output += '{} repeats {} times.\n'.format(lines[index], counter)
counter = 1
counter += 1
return output
And call this like
print(count_consecutive_lines(content))
An answer that doesn't load the whole file into memory:
last = None
count = 0
result = []
with open('sample.txt', 'rb') as f:
for line in f:
line = line.strip()
if line == last:
count = count + 1
else:
if count > 0:
result.append(count)
count = 1
last = line
result.append(count)
print result
Result:
[3, 4, 5]
UPDATE
The list contains integers, you can only join strings, so you will have to convert it.
outFile.write('\n'.join(str(n) for n in result))
You can try to convert the file data into a list and follow the approach given below:
with open("./sample.txt", 'r') as fl:
fl_list = list(fl)
unique_data = set(fl_list)
for unique in unique_data:
print "%s - count: %s" %(unique, fl_list.count(unique))
#output:
ANS - count: 8
AUT - count: 4
Open your file and read it to count:
l=[]
last=''
with open('data.txt', 'r') as f:
data = f.readlines()
for line in data:
words = line.split()
if words[0]==last:
l[-1]=l[-1]+1
last=words[0]
else:
l.append(1)
if last=='':
last=words[0]
Here is your expected output :)
with open("./sample.txt", 'r') as fl:
word = list(fl)
count=1
length=[]
for i in range(1, len(word)):
if word[i-1] == word[i]:
count += 1
else:
length.append(count)
count=1
length.append(count)
print (length)
#output as you excpect:
[3, 4, 5]

Count lines after line with specific character

I have a file which contains this data:
>P136
FCF#0.73
FCF#0.66
FCF#0.86
>P129
FCF#0.72
>P142
>P144
>P134
FCF#0.70
FCF#0.82
And I need to count the number of lines after a line containing ">" , but keeping the ">" line as reference, for this example the output should be:
>P136 3
>P129 1
>P134 2
Any ideas?
Use dictionary to store the count per line, and every time there is no > at the start, increment the count:
counts = {}
current = None
with open(filename) as fo:
for line in fo:
if line.startswith('>'):
current = line.strip()
counts[current] = 0
else:
counts[current] += 1
then simply loop and print the counts:
for entry, count in counts.items():
print('{} {:2d}'.format(entry, count))
You could even just print the number every time you find a new section:
count = 0
current = None
with open(filename) as fo:
for line in fo:
if line.startswith('>'):
if current and count:
print('{} {:2d}'.format(entry, count))
current = line.strip()
counts = 0
else:
count += 1
if current and count:
print('{} {:2d}'.format(entry, count))
but you cannot then easily re-purpose the counts for other work.
In one line, just to show that we can:
s=""">P136
FCF#0.73
FCF#0.66
FCF#0.86
>P129
FCF#0.72
>P142
>P144
>P134
FCF#0.70
FCF#0.82
"""
First variant:
print [(i.split("\n")[0],len(i.split("\n")[1:])-1) for i in s.split(">")if i if len(i.split("\n")[1:])-1>0]
using re:
import re
print [ (block.split("\n")[0],sum(1 for m in re.finditer("#", block)))for block in s.split(">")]
This is a simple solution that attempts to be minimalistic.
with open(filename) as f:
def printcc(current, count):
if current is not None and count > 0:
print(current.strip(), count)
current = None
count = 0
for line in f:
if line[0] == '>':
printcc(current, count)
current = line
count = 0
else:
count += 1
printcc(current, count)
In case you actually want all lines that contain a > character, use '>' in line as your condition. If you're targeting Python 2.x, use print current.strip(), count because having the outer parentheses will print a two-tuple.

Categories

Resources