prints line number in both txtfile and list? - python

I have this code, which prints the line number in infile but also the line number in the words list. What do I do to print only the line number of the txt file next to the words?
# Build {word -> [line numbers]} for every word in infile, plus a flat
# list of all words seen. (This is the asker's code, quoted as-is.)
d = {}
counter = 0
wrongwords = []
for line in infile:
# NOTE(review): rebinds the outer name ``infile`` to this line's word
# list; iteration continues because the iterator was captured at loop
# start, but the shadowing is confusing.
infile = line.split()
wrongwords.extend(infile)
counter += 1
for word in infile:
if word not in d:
d[word] = [counter]
# BUG (the subject of this question): this should be ``else`` — as
# written, a newly-inserted word immediately matches ``word in d`` and
# gets its line number appended a second time.
if word in d:
d[word].append(counter)
for stuff in wrongwords:
print(stuff, d[stuff])
the output is :
hello [1, 2, 7, 9] # this is printing the linenumber of the txt file
hello [1] # this is printing the linenumber of the list words
hello [1]
what i want is:
hello [1, 2, 7, 9]

Four things:
You can keep track of the line number by doing this instead of handling a
counter on your own:
for line_no, word in enumerate(infile):
As sateesh pointed out above, you probably need an else in your
conditions:
if word not in d:
d[word] = [counter]
else:
d[word].append(counter)
Also note that the above code snippet is exactly what defaultdicts are
for:
from collections import defaultdict
d = defaultdict(list)
Then in your main loop, you can get rid of the if..else part:
d[word].append(counter)
Why are you doing wrongwords.extend(infile)?
Also, I don't really understand how you are supposed to decide what "wrong words" are. I assume that you have a set named wrongwords that contains the wrong words, which makes your final code something like this:
# Build {wrong word -> [line numbers]}; defaultdict(list) removes the
# need for any membership test before appending.
from collections import defaultdict

d = defaultdict(list)
wrongwords = set(["hello", "foo", "bar", "baz"])
for counter, line in enumerate(infile):
    infile = line.split()  # NOTE: rebinds the outer name, as in the original
    for word in infile:
        if word in wrongwords:
            d[word].append(counter)

Related

Get duplicated lines in file

Working on file with thousands of line
trying to find which line is duplicated exactly ( 2 times )
# Count how many times each whole line occurs in log.txt.
from collections import Counter
with open('log.txt') as f:
string = f.readlines()
c = Counter(string)
# NOTE(review): ``print c`` is Python 2 syntax; under Python 3 this is a
# SyntaxError and would need to be ``print(c)``.
print c
It gives me all of the duplicated lines, but I need only the lines repeated exactly 2 times.
You're printing all the strings and not just the repeated ones, to print only the ones which are repeated twice, you can print the strings which have a count of two.
# Tally identical lines, then report only the ones seen exactly twice.
from collections import Counter

with open('log.txt') as f:
    duplicate_counts = Counter(f.readlines())

for entry, occurrences in duplicate_counts.items():
    if occurrences == 2:
        print(entry)
The Counter Object also provides information about how often a line occurs.
You can filter it using e.g. list comprehension.
This will print all lines, that occur exactly two times in the file
# Collect every line occurring exactly twice into a list and print it.
with open('log.txt') as handle:
    frequencies = Counter(handle.readlines())
print([text for text, freq in frequencies.items() if freq == 2])
If you want to have all repeated lines (lines duplicated two or more times)
# Same idea, but keep every line that appears more than once.
with open('log.txt') as handle:
    frequencies = Counter(handle.readlines())
print([text for text, freq in frequencies.items() if freq > 1])
You could use Counter.most_common i.e.
# Counter.most_common(1) yields the single (line, count) pair with the
# highest count; Counter(f) counts the file's lines directly.
from collections import Counter

with open('log.txt') as handle:
    line_counter = Counter(handle)
print(line_counter.most_common(1))
This prints the Counter entry with the highest count.

Count the occurrence of specific words on a text file and print the 50 most occurred ones among them

I want to count the occurrences of specific keywords (stored in a .txt file, one word per line) in a text file, and print the 50 most frequent ones. Here's what I did:
from collections import Counter
# Count every whitespace-separated token in the text file.
with open("./Text_file.txt", "r", encoding='utf8') as logfile:
word_counts = Counter(logfile.read().split())
with open("./key_words.txt", "r", encoding='utf8') as word:
lines = word.readlines()
for line in lines:
# BUG (the subject of this question): 'line' is a string literal, so
# every iteration looks up the same key and overwrites ``count``. It
# should be the variable ``line`` — and note readlines() keeps the
# trailing '\n', so the value would still need .strip() to match.
count = [word_counts.get('line')]
lst = sorted (count)
print (lst[:50])
It returns this, which doesn't mean anything:
[20]
Any help ?
One Option
from collections import Counter

# Load the keyword list once into a set for O(1) membership tests.
with open("./key_words.txt", "r", encoding='utf8') as keyfile:
    keywords = set(keyfile.read().split('\n'))

# Stream the text file, counting only tokens that are keywords.
with open("./Text_file.txt", "r", encoding='utf8') as logfile:
    keyword_counts = Counter()
    for line in logfile:
        if line:
            stripped = line.rstrip()
            keyword_counts.update(
                token for token in stripped.split() if token in keywords
            )

# The 50 most frequent keywords as (word, count) pairs.
print(keyword_counts.most_common(50))
Alternative Using Counter+Regex
Regex is used to separate words from punctuation, i.e. periods, quotes, commas, etc.
import re
from collections import Counter

with open("./key_words.txt", "r", encoding='utf8') as keyfile:
    keywords = keyfile.read().lower().split('\n')

with open("./Text_file.txt", "r", encoding='utf8') as logfile:
    counts = Counter()
    for line in logfile:
        # The regex extracts alphabetic runs, discarding punctuation
        # (periods, quotes, commas, ...); lowercase for comparison.
        tokens = (match.lower() for match in re.findall('[a-zA-Z]+', line, flags=re.A))
        counts.update(token for token in tokens if token in keywords)

print(counts.most_common(50))
Here is what you can do:
from collections import Counter

with open("./Text_file.txt", "r") as file, open("./key_words.txt", "r") as word:
    text_words = [w.strip() for w in file.read().split()]  # words from the text file
    key_words = [w.strip() for w in word.read().split()]   # words from the key file

# Every word of the text file that also appears in the key file.
matches = [w1 for w1 in text_words if w1 in key_words]
counts = Counter(matches)  # word -> occurrences among the matches

# Sort (count, word) pairs descending and keep the 50 most frequent words.
pairs = sorted([(v, k) for k, v in counts.items()], reverse=True)[:50]
lst = [k for v, k in pairs]
print(lst)
Here, with word_counts.get('line'), you are asking for the occurrences of the literal string 'line' on every iteration, which is why your result list has a single value. The following is a modified version of your code that gives the top 50 words from the keywords.
from collections import Counter

# Count every token in the text file, then keep only the keyword entries.
with open("./Text_file.txt", "r", encoding='utf8') as logfile:
    word_counts = Counter(logfile.read().split())
wc = dict(word_counts)

kwc = {}  # keyword -> count
with open("./key_words.txt", "r", encoding='utf8') as word:
    for raw in word.readlines():
        # Each keyword sits on its own line; drop the trailing '\n'.
        keyword = raw.strip()
        if keyword in wc.keys():
            kwc[keyword] = wc[keyword]  # record found keyword's count

# Keys sorted by decreasing count; print the top 50.
lst = sorted(kwc, key=kwc.get, reverse=True)
print(lst[:50])
I modified your code - you were close but needed to fix a few things:
You were only storing a single count, not building a list of words. I solved this by making a new dict of words-to-counts, but only for the found keywords.
As others have said, you were using the string literal 'line' instead of line
You weren't stripping the newline from each line - when you use readlines() the \n newline is at the end of each line, so none of your words were being found in your Counter.
So, here's the code. It prints out the keywords in descending order of counts, and just the 1st 50:
from collections import Counter

# Count every whitespace-separated token of the text file.
with open("./Text_file.txt", "r", encoding='utf8') as logfile:
    word_counts = Counter(logfile.read().split())

found_keywords = {}
with open("./key_words.txt", "r", encoding='utf8') as word:
    for raw_line in word.readlines():
        keyword = raw_line.rstrip()        # readlines() keeps the '\n'
        occurrences = word_counts[keyword]  # Counter returns 0 when absent
        if occurrences > 0:
            found_keywords[keyword] = occurrences
>>> print([(k, v) for k, v in sorted(found_keywords.items(), key=lambda item: item[1], reverse=True)][:50])
[('cat', 3), ('dog', 1)]

Python: Creating a function counting specific words in a textfile

I want to create a function that returns the value of word count of a specific word in a text file.
Here's what I currently have:
def Word_Counter(Text_File, Word):
    """Count (case-insensitively) how often *Word* occurs in *Text_File*.

    Prints ``<word> ;  <count>`` and returns the count. Note that
    str.count() counts substring hits, so "the" also matches "there".
    """
    # Use a context manager so the file handle is closed; the original
    # called open().read() and leaked the handle.
    with open(Text_File, 'r') as handle:
        Data = handle.read().lower()
    count = Data.count(Word)
    print(Word, "; ", count)
    # The asker wants the value back, not just printed; returning it is
    # backward-compatible with callers that ignore the result.
    return count

Word_Counter('Example.txt', "the")
Which returns: "the ; 35"
That is pretty much what I want it to do. But what if I want to test a text for a range of words. I want the words (key) and values in say a list or dictionary. What's a way of doing that without using modules?
Say if I tested the function with this list of words: [time, when, left, I, do, an, who, what, sometimes].
The results I would like would be something like:
Word Counts = {'time': 1, 'when': 4, 'left': 0, 'I': 5, 'do': 2, 'an': 0, 'who': 1, 'what': 3, 'sometimes': 1}
I have been able to create a dictionary which does a word count for every word, like example below.
# Count every word's frequency; commas are replaced by spaces first so
# comma-separated tokens split too. Assumes ``words`` is a string
# defined earlier (not shown here).
wordfreq = {}
for word in words.replace(',', ' ').split():
wordfreq[word] = wordfreq.setdefault(word, 0) + 1
I'd like to do a similar style but only targeting specific words, any suggestions?
Based on your given code (note: I did not test this):
def Word_Counter(Text_File, word_list):
    """Return {word: count} for every word in *word_list*.

    Counts are case-insensitive substring counts over the whole file
    (so "the" also matches "there"), mirroring the asker's Data.count().
    """
    # Close the file deterministically instead of leaking the handle.
    with open(Text_File, 'r') as handle:
        Data = handle.read().lower()
    output = {}
    for word in word_list:
        # BUG FIX: the original referenced the undefined name ``Word``
        # here, raising NameError on the first iteration.
        output[word] = Data.count(word)
    # BUG FIX: the original built ``output`` but never returned it.
    return output
Or you can do this
text = open("sample.txt", "r")
# Create an empty dictionary
d = dict()
# Loop through each line of the file
for line in text:
# Remove the leading spaces and newline character
line = line.strip()
# Convert the characters in line to
# lowercase to avoid case mismatch
line = line.lower()
# Split the line into words
words = line.split(" ")
# Iterate over each word in line
for word in words:
# Check if the word is already in dictionary
if word in d:
# Increment count of word by 1
d[word] = d[word] + 1
else:
# Add the word to dictionary with count 1
d[word] = 1
UPDATE
Try the following:
keywords = ['the', 'that']
worddict = {}
with open('out.txt', 'r') as f:
    text = f.read().split(' ')  # or f.read().split(',')
for word in text:
    # Increment an existing tally or start one at 1.
    if word in worddict:
        worddict[word] += 1
    else:
        worddict[word] = 1
# Each element is the 2-element set {keyword, count}, as in the original.
print([{x, worddict[x]} for x in keywords])

Create dictionary from a list deleting all spaces

Sorry for asking, but I'm kind of new to these things. I'm splitting words out of a text and putting them into a dict, creating an index for each token:
import re
# NOTE(review): the file handle is never closed; a with-statement would fix that.
f = open('/Users/Half_Pint_Boy/Desktop/sentenses.txt', 'r')
a=0
c=0
e=[]
for line in f:
# Splitting on every single non-letter yields '' entries wherever two
# delimiters are adjacent — the source of the '' key this question is
# about. (Splitting on [^a-z]+ would avoid most of them.)
b=re.split('[^a-z]', line.lower())
a+=len(list(filter(None, b)))
c = c + 1
e = e + b
# Maps each token to the index of its LAST occurrence (later pairs from
# zip overwrite earlier ones).
d = dict(zip(e, range(len(e))))
But in the end I receive a dict with spaces in it like that:
{'': 633,
'a': 617,
'according': 385,
'adjacent': 237,
'allow': 429,
'allows': 459}
How can I remove "" from the final result in dict? Also how can I change the indexing after that to not use "" in index counting? (with "" the index count is 633, without-248)
Big thanks!
How about this?
b = list(filter(None, re.split('[^a-z]', line.lower())))
As an alternative:
b = re.findall('[a-z]+', line.lower())
Either way, you can then also remove that filter from the next line:
a += len(b)
EDIT
As an aside, I think what you end up with here is a dictionary mapping words to the last position in which they appear in the text. I'm not sure if that's what you intended to do. E.g.
>>> dict(zip(['hello', 'world', 'hello', 'again'], range(4)))
{'world': 1, 'hello': 2, 'again': 3}
If you instead want to keep track of all the positions a word occurs, perhaps try this code instead:
from collections import defaultdict
import re

# word -> list of every position at which it occurs in the text.
indexes = defaultdict(list)
with open('test.txt', 'r') as source:
    tokens = re.findall(r'[a-z]+', source.read().lower())
for position, token in enumerate(tokens):
    indexes[token].append(position)
indexes then maps each word to a list of indexes at which the word appears.
EDIT 2
Based on the comment discussion below, I think you want something more like this:
from collections import defaultdict
import re
# NOTE(review): defaultdict is imported but unused in this snippet.
word_positions = {}
with open('test.txt', 'r') as f:
index = 0
for word in re.findall(r'[a-z]+', f.read().lower()):
if word not in word_positions:
word_positions[word] = index
# NOTE(review): the paste lost its indentation, so it is ambiguous
# whether this increment sits inside the ``if`` (dense per-unique-word
# indexes, which the sample output below suggests) or runs once per
# word — confirm against the original answer before reusing.
index += 1
print(word_positions)
# Output:
# {'hello': 0, 'goodbye': 2, 'world': 1}
Your regex is not a good one. Consider using:
line = re.sub('[^a-z]*$', '', line.strip())
b = re.split('[^a-z]+', line.lower())
Replace:
d = dict(zip(e, range(len(e))))
With:
d = {word:n for n, word in enumerate(e) if word}
Alternatively, to avoid the empty entries in the first place, replace:
b=re.split('[^a-z]', line.lower())
With:
b=re.split('[^a-z]+', re.sub('(^[^a-z]+|[^a-z]+$)', '', line.lower()))

Python - Dictionary of sets with the same key

I'm trying to create a function that will read a text file that has one word on each line, like
afd
asmv
adsasd
It will take words of the user given length and will construct a python dictionary where the key is a string of the word where the letters are sorted. The values will be a set of all words that have the same key. So far I have:
def setdict():
# The asker's incomplete attempt, quoted as-is.
wordfile = argv[1]
# NOTE(review): the return value of open() is discarded, so the loop
# below iterates over the characters of the *filename string*, not the
# file contents. It should be ``infile = open(wordfile, "r")``.
open(wordfile, "r")
# NOTE(review): this local shadows the function's own name.
setdict = {}
for line in wordfile:
words = line.split()
for word in words:
word = word.rstrip("\n")
# NOTE(review): ``wordlength`` is undefined here, and the intended test
# is presumably ``len(word) == wordlength``, not word == wordlength.
if word == wordlength:
key = str(sorted(word))
I'm a little lost on how to create the sets with words that have the same key and put them in the dictionary. Any help would be appreciated.
collections.defaultdict is useful here:
from collections import defaultdict
from pprint import pprint

# anagram key (the word's letters, sorted) -> set of words sharing it
words = defaultdict(set)
with open('input.txt') as input_file:
    for line in input_file:
        for word in line.split():
            words[''.join(sorted(word))].add(word)
pprint(words)
Of course, anything you can do with defaultdict, you can also do with dict.setdefault():
# Same grouping with a plain dict: setdefault() inserts an empty set
# the first time an anagram key is seen.
words = dict()
with open('input.txt') as input_file:
    for line in input_file:
        for word in line.split():
            key = ''.join(sorted(word))
            words.setdefault(key, set()).add(word)
start with something simple
words = ["hello","python","world"]
my_dict = {}
for word in words:
try:
my_dict[sorted(word)].append(word)
except KeyError:
my_dict[sorted(word)] = [word]
now instead of using predefined words read them from a file
words = map(str.split,open("some_word_file.txt"))
The key here is to access the dictionary with a for loop that makes the value set available for manipulation. You can solve your problem by reading the file line by line (readline) and checking the following:
# NOTE(review): this fragment is a sketch, not working code — iterating
# a dict yields keys only, so ``for key, value in my_dict`` raises
# ValueError (it needs my_dict.items()); ``word`` is undefined here;
# and in the ``else`` branch ``value`` holds the previous iteration's
# set, not a fresh one. It illustrates the idea of appending to the
# value collection for a matching key.
for key, value in my_dict:
if sorted(word) == key:
value.append(word)
else:
my_dict[sorted(word)] = value

Categories

Resources