Reading specific words from an online source - Python - python

Here's the content of the text file abc.txt
This is before the start and should be ignored.
So should this
and this
*** START OF SYNTHETIC TEST CASE ***
a ba bac
*** END OF SYNTHETIC TEST CASE ***
This is after the end and should be ignored too.
Have a nice day.
I need to write a function, get_words_from_file(filename), that returns a list of lower case words as shown in the sample case below. The function should only process lines between the start and end marker lines and use the definition of words provided below.
I am provided with the following regular expression that describes what is required. I am not expected to understand how regular expressions work, I just need to understand that the call to findall given below will return a list of the relevant words from a given line string.
words_on_line = re.findall("[a-z]+[-'][a-z]+|[a-z]+[']?|[a-z]+", line)
.Include all lower-case character sequences including those that contain a
- or ' character and those that end with a ' character.
.Words that end with a - MUST NOT be included.
.The words should be in the same order as they occur in the file.
.There must be no more than 9 CONSTANTS declared.
.Functions must be no longer than 20 statements.
.Functions must not have more than 3 parameters.
Test Code:
filename = "abc.txt"
words2 = get_words_from_file(filename)
print(filename, "loaded ok.")
print("{} valid words found.".format(len(words2)))
print("Valid word list:")
print("\n".join(words2))
Expected Output:
abc.txt loaded ok.
3 valid words found.
Valid word list:
a
ba
bac
My Code is as follows:
def stripped_lines(lines):
for line in lines:
stripped_line = line.rstrip('\n')
yield stripped_line
def lines_from_file(fname):
with open(fname, 'rt', encoding='utf8') as flines:
for line in stripped_lines(flines):
yield line
def is_marker_line(line, start='***', end='***'):
'''
Marker lines start and end with the given strings, which may not
overlap. (A line containing just '***' is not a valid marker line.)
'''
min_len = len(start) + len(end)
if len(line) < min_len:
return False
return line.startswith(start) and line.endswith(end)
def advance_past_next_marker(lines):
'''
Advances the given iterator through the first encountered marker
line, if any.
'''
for line in lines:
if is_marker_line(line):
break
def lines_before_next_marker(lines):
'''
Yields all lines up to but not including the next marker line. If
no marker line is found, yields no lines.
'''
valid_lines = []
for line in lines:
if is_marker_line(line):
break
valid_lines.append(line)
else:
# `for` loop did not break, meaning there was no marker line.
valid_lines = []
for content_line in valid_lines:
yield content_line
def lines_between_markers(lines):
'''
Yields the lines between the first two marker lines.
'''
# Must use the iterator --- if it's merely an iterable (like a list
# of strings), the call to lines_before_next_marker will restart
# from the beginning.
it = iter(lines)
advance_past_next_marker(it)
for line in lines_before_next_marker(it):
yield line
def words(lines):
text = '\n'.join(lines).lower().split()
# Same as before...
def get_words_from_file(fname):
for word in words(lines_between_markers(lines_from_file(fname))):
return word
filename = "abc.txt"
words2 = get_words_from_file(filename)
print(filename, "loaded ok.")
print("{} valid words found.".format(len(words2)))
print("Valid word list:")
print("\n".join(words2))
My Crappy Output
Traceback (most recent call last):
File "C:/Users/Jill/SQ4.1(2).py", line 67, in <module>
words2 = get_words_from_file(filename)
File "C:/Users/Jason/SQ4.1(2).py", line 63, in <module>
for word in words(lines_between_markers(lines_from_file(fname))):
builtins.TypeError: 'NoneType' object is not iterable
Could you help me correct my code? I am at a total loss.

I have changed the original code a bit, try below.
def stripped_lines(lines):
    '''
    Yields each line from the given iterable with any trailing newline
    characters removed.
    '''
    for line in lines:
        yield line.rstrip('\n')
def lines_from_file(fname):
    '''
    Yields the lines of the named text file, decoded as UTF-8, with
    trailing newline characters removed.
    '''
    # Name the encoding explicitly; otherwise the platform default is
    # used and the same file can decode differently on another machine.
    with open(fname, 'rt', encoding='utf8') as flines:
        yield from stripped_lines(flines)
def is_marker_line(line, start='***', end='***'):
    '''
    Returns True if the given line is a marker line, False otherwise.

    Marker lines start and end with the given strings, which may not
    overlap. (A line containing just '***' is not a valid marker line.)
    '''
    # The length check stops a short line from matching when `start`
    # and `end` would have to share characters.
    if len(line) < len(start) + len(end):
        return False
    return line.startswith(start) and line.endswith(end)
def advance_past_next_marker(lines):
    '''
    Advances the given iterator through the first encountered marker
    line, if any. If there is no marker line, the iterator is consumed
    to its end.
    '''
    for line in lines:
        if is_marker_line(line):
            return
def lines_before_next_marker(lines):
    '''
    Yields all lines up to but not including the next marker line. If
    no marker line is found, yields no lines.
    '''
    # Lines are buffered rather than yielded as they are read, because
    # whether they count is only known once a marker line turns up.
    valid_lines = []
    for line in lines:
        if is_marker_line(line):
            yield from valid_lines
            return
        valid_lines.append(line)
    # Input ended without a marker line: the buffered lines are dropped.
def lines_between_markers(lines):
    '''
    Yields the lines between the first two marker lines.
    '''
    # Must use the iterator --- if it's merely an iterable (like a list
    # of strings), the call to lines_before_next_marker will restart
    # from the beginning.
    it = iter(lines)
    advance_past_next_marker(it)
    yield from lines_before_next_marker(it)
def words(lines):
    '''
    Returns a list of the lower-case words found in the given lines, in
    the order they occur.

    Each line is lower-cased and then searched with the regular
    expression supplied in the assignment. A word is a run of letters
    that may contain one - or ' between letters, or end with a '. A
    trailing - is not part of the word.
    '''
    # Imported here because no import section is visible in this listing.
    import re

    word_pattern = "[a-z]+[-'][a-z]+|[a-z]+[']?|[a-z]+"
    found = []
    for line in lines:
        found.extend(re.findall(word_pattern, line.lower()))
    return found
def get_words_from_file(fname):
    '''
    Returns the words found on the lines between the first two marker
    lines of the named file, as produced by words().
    '''
    return words(lines_between_markers(lines_from_file(fname)))
filename = "abc.txt"
all_words = get_words_from_file(filename)
print(filename, "loaded ok.")
print("{} valid words found.".format(len(all_words)))
print("Valid word list:")
print("\n".join(all_words))
Output will be below,
('abc.txt', 'loaded ok.')
3 valid words found.
Valid word list:
a
ba
bac

Related

How can I go to the next line in .txt file?

How can I read only first symbol in each line with out reading all line, using python?
For example, if I have file like:
apple
pear
watermelon
In each iteration I must store only one (the first) letter of line.
Result of program should be ["a","p","w"], I tried to use file.seek(), but how can I move it to the new line?
ti7's answer is great, but if the lines might be too long to hold in memory, you might wish to read char-by-char so that a whole line is never stored:
from pathlib import Path
from typing import Iterator
NEWLINE_CHAR = {'\n', '\r'}


def first_chars(file_path: Path) -> Iterator[str]:
    """Yield the first character of every non-empty line of the file.

    The file is read one character at a time, so a whole line is never
    held in memory however long it is.
    """
    with open(file_path) as fh:
        new_line = True  # True while the next character read starts a line
        while c := fh.read(1):
            if c in NEWLINE_CHAR:
                new_line = True
            elif new_line:
                yield c
                new_line = False
Test:
path = Path('/some/path/a.py')
easy_first_chars = [l[0] for l in path.read_text().splitlines() if l]
smart_first_chars = list(first_chars(path))
assert smart_first_chars == easy_first_chars
file-like objects are iterable, so you can directly use them like this
collection = []
with open("input.txt") as fh:
for line in fh: # iterate by-lines over file-like
try:
collection.append(line[0]) # get the first char in the line
except IndexError: # line has no chars
pass # consider other handling
# work with collection
You may also consider enumerate() if you cared about which line a particular value was on, or yielding line[0] to form a generator (which may allow a more efficient process if it can halt before reading the entire file)
def my_generator():
    """Yield ``(lineno, first_char)`` for each non-blank line of input.txt.

    Line numbers start at 1 and count blank lines too, so they match the
    position of the line in the file.
    """
    with open("input.txt") as fh:
        for lineno, line in enumerate(fh, 1):  # lines are commonly 1-indexed
            # Iterating a file keeps the line terminator, so a blank line
            # arrives as "\n" rather than "". Indexing it never raises
            # IndexError, so blank lines are tested for explicitly.
            first = line[:1]
            if first and first != "\n":
                yield lineno, first
for lineno, first_letter in my_generator():
# work with lineno and first_letter here and break when done
You can read one letter with file.read(1)
letters = []
# Initialized to '\n' so that the very first letter of the file is stored
previous = '\n'
# The with-block closes the file even if reading fails part-way
with open(filepath, "r") as file:
    while True:
        # Read only one letter
        letter = file.read(1)
        if letter == '':
            # read() returns an empty string at end of file
            break
        # Store the letter that follows a line break '\n'. A '\n' that
        # directly follows another is a blank line and is not stored.
        if previous == '\n' and letter != '\n':
            letters.append(letter)
        previous = letter

How to read text files in python with specified condition?

I have a text file and I want to extract the number of the line that contains certain phrases (ATOMIC_POSITIONS (angstrom) and K_POINTS (automatic)).
n = -1
with open(filename) as f:
for line in f:
n += 1
if line == "ATOMIC_POSITIONS (angstrom)":
print('test1')
start = n
elif line == "K_POINTS (automatic)":
print('test2')
end = n
print(start, end)
My problem is that python does not go inside the if statements (i.e. test1 and test2 are not printed).
But I am sure that the file contains the phrases; this is a small part of it:
0.000000613 0.000000613 1.022009120
ATOMIC_POSITIONS (angstrom)
C 1.696797551 1.714436737 -0.068349117
Simply put: your condition is not met. "==" checks for equality, which for several reasons may not be true in your case (see comments).
When checking for a string in a line of a file I would try this:
n=-1
with open(filename) as f:
for line in f:
n += 1
if "ATOMIC_POSITIONS (angstrom)" in line:
print('test1')
start = n

Find anagrams of a given word in a file

Alright so for class we have this problem where we need to be able to input a word and from a given text file (wordlist.txt) a list will be made using any anagrams of that word found in the file.
My code so far looks like this:
def find_anagrams1(string):
"""Takes a string and returns a list of anagrams for that string from the wordlist.txt file.
string -> list"""
anagrams = []
file = open("wordlist.txt")
next = file.readline()
while next != "":
isit = is_anagram(string, next)
if isit is True:
anagrams.append(next)
next = file.readline()
file.close()
return anagrams
Every time I try to run the program it just returns an empty list, despite the fact that I know there are anagrams present. Any ideas on what's wrong?
P.S. The is_anagram function looks like this:
def is_anagram(string1, string2):
    """Takes two strings and returns True if the strings are anagrams of each other.
    str,str -> bool"""
    # Two strings are anagrams exactly when their sorted characters match.
    # Every character counts, including whitespace such as a trailing newline.
    return sorted(string1) == sorted(string2)
I am using Python 3.4
The problem is that you are using the readline function. From the documentation:
file.readline = readline(...)
readline([size]) -> next line from the file, as a string.
Retain newline. A non-negative size argument limits the maximum
number of bytes to return (an incomplete line may be returned then).
Return an empty string at EOF.
The key information here is "Retain newline". That means that if you have a file containing a list of words, one per line, each word is going to be returned with a terminal newline. So when you call:
next = file.readline()
You're not getting example, you're getting example\n, so this will never match your input string.
A simple solution is to call the strip() method on the lines read from the file:
next = file.readline().strip()
while next != "":
isit = is_anagram(string, next)
if isit is True:
anagrams.append(next)
next = file.readline().strip()
file.close()
However, there are several problems with this code. To start with, file and next are poor names for variables: next shadows the built-in next() function, and in Python 2 file shadows the built-in file type.
Rather than repeatedly calling readline(), you're better off taking advantage of the fact that an open file is an iterator which yields the lines of the file:
words = open('wordlist.txt')
for word in words:
word = word.strip()
isit = is_anagram(string, word)
if isit:
anagrams.append(word)
words.close()
Note also here that since is_anagram returns True or False, you
don't need to compare the result to True or False (e.g., if isit
is True). You can simply use the return value on its own.
Yikes, don't use for loops:
import collections
def find_anagrams(x):
    """Return the words of x that share their letters with another word of x.

    x is an iterable of strings. The result keeps the original order and
    keeps duplicates; a word that appears twice counts as its own anagram.
    """
    words = list(x)
    # The sorted letters of a word are the same for all of its anagrams.
    keys = [''.join(sorted(word)) for word in words]
    # A set gives constant-time membership tests in the filter below.
    repeated = {key for key, count in collections.Counter(keys).items() if count > 1}
    return [word for word, key in zip(words, keys) if key in repeated]
Here's another solution, that I think is quite elegant. This runs in O(n * m) where n is the number of words and m is number of letters (or average number of letters/word).
# anagrams.py
from collections import Counter
import urllib.request
def word_hash(word):
    """Return a hashable key that is equal for any two anagrams.

    The key is the frozenset of (letter, count) pairs of the word, so it
    does not depend on the order of the letters.
    """
    return frozenset(Counter(word).items())
def download_word_file():
    """Download the word list from the hard-coded URL and save it as
    words.txt in the working directory."""
    url = 'https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt'
    urllib.request.urlretrieve(url, 'words.txt')
def read_word_file():
    """Return the lines of words.txt in the working directory as a list
    of strings without their line terminators."""
    with open('words.txt') as f:
        return f.read().splitlines()
if __name__ == "__main__":
    # downloads a file to your working directory
    download_word_file()
    # reads file into memory
    words = read_word_file()
    # groups the words by letter counts, so that anagrams share one list
    d = {}
    for word in words:
        d.setdefault(word_hash(word), []).append(word)
    # Prints the filtered results to only words with anagrams
    print([x for x in d.values() if len(x) > 1])

Read specific character from line in Text File (Python)

char1= "P"
length=5
f = open("wl.txt", 'r')
for line in f:
if len(line)==length and line.rstrip() == char1:
z=Counter(line)
print z
I want to output only lines where length=5 and that contain the character p. So far I have:
f = open("wl.txt", 'r')
for line in f:
if len(line)==length :#This one only works with the length
z=Counter(line)
print z
Any guess someone?
Your problem is:
if len(line)==length and line.rstrip() == char1:
If a line is 5 characters long, then after removing trailing whitespace, you're then comparing to see if it's equal to a string of length 1... 'abcde' is never going to equal 'p' for instance, and your check will never run if your line contains 'p' as it's not 5 characters...
I'm not sure what you're trying to do with Counter
Corrected code is:
# note in capitals to indicate 'constants'
LENGTH = 5
CHAR = 'p'
with open('wl.txt') as fin:
for line in fin:
# Check length *after* any trailing whitespace has been removed
# and that CHAR appears anywhere **in** the line
if len(line.rstrip()) == LENGTH and CHAR in line:
print 'match:', line

parsing a fasta file using a generator ( python )

I am trying to parse a large fasta file and I am encountering out of memory errors. Some suggestions to improve the data handling would be appreciated. Currently the program correctly prints out the names however partially through the file I get a MemoryError
Here is the generator
def readFastaEntry( fp ):
name = ""
seq = ""
for line in fp:
if line.startswith( ">" ):
tmp = []
tmp.append( name )
tmp.append( seq )
name = line
seq = ""
yield tmp
else:
seq = seq.join( line )
and here is the caller stub; more will be added after this part works
fp = open( sys.argv[1], 'r' )
for seq in readFastaEntry( fp ) :
print seq[0]
For those not familiar with the fasta format, here is an example:
>1 (PB2)
AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATAC
TCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCC
TGCACTCAGGATGAAGTGGATGATG
>2 (PB1)
AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACC
ACATTTCCCTATACTGGAGACCCTCC
each entry starts with a ">" stating the name etc then the next N lines are data. There is no defined ending of the data other than the next line having a ">" at the beginning.
Have you considered using BioPython? They have a sequence reader that can read fasta files. And if you are interested in coding one yourself, you can take a look at BioPython's code.
Edit: Code added
def read_fasta(fp):
    """Yield a (name, sequence) tuple for each record in fp.

    fp is any iterable of lines. A line starting with ">" begins a record
    and is used whole, minus trailing whitespace, as its name. The lines
    that follow are joined, without their trailing whitespace, to form
    the sequence. Lines before the first ">" line are discarded.
    """
    name, seq = None, []
    for line in fp:
        line = line.rstrip()
        if line.startswith(">"):
            if name:
                yield (name, ''.join(seq))
            name, seq = line, []
        else:
            # Collect the pieces in a list and join once per record.
            seq.append(line)
    # The last record has no following header line to trigger its yield.
    if name:
        yield (name, ''.join(seq))
with open('f.fasta') as fp:
for name, seq in read_fasta(fp):
print(name, seq)
A pyparsing parser for this format is only a few lines long. See the annotations in the following code:
data = """>1 (PB2)
AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATAC
TCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCC
TGCACTCAGGATGAAGTGGATGATG
>2 (PB1)
AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACC
ACATTTCCCTATACTGGAGACCCTCC"""
from pyparsing import Word, nums, QuotedString, Combine, OneOrMore
# define some basic forms
integer = Word(nums)
key = QuotedString("(", endQuoteChar=")")
# sequences are "words" made up of the characters A, G, C, and T
# we want to match one or more of them, and have the parser combine
# them into a single string (Combine by default requires all of its
# elements to be adjacent within the input string, but we want to allow
# for the intervening end of lines, so we add adjacent=False)
sequence = Combine(OneOrMore(Word("AGCT")), adjacent=False)
# define the overall pattern to scan for - attach results names
# to each matched element
seqEntry = ">" + integer("index") + key("key") + sequence("sequence")
for seq,s,e in seqEntry.scanString(data):
# just dump out the matched data
print seq.dump()
# could also access fields as seq.index, seq.key and seq.sequence
Prints:
['>', '1', 'PB2', 'AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATACTCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCCTGCACTCAGGATGAAGTGGATGATG']
- index: 1
- key: PB2
- sequence: AATATATTCAATATGGAGAGAATAAAAGAACTAAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATACTCACCAAAACCACTGTGGACCACATGGCCATAATCAAAAAGTACACATCAGGAAGGCAAGAGAAGAACCCTGCACTCAGGATGAAGTGGATGATG
['>', '2', 'PB1', 'AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACCACATTTCCCTATACTGGAGACCCTCC']
- index: 2
- key: PB1
- sequence: AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACCACATTTCCCTATACTGGAGACCCTCC
Without having a great understanding of what you are doing, I would have written the code like this:
def readFastaEntry( fp ):
    """Yield a (header, data) tuple for each entry read from fp.

    fp is an open file-like object with a readline() method. The header
    is the line that starts the entry. The data is the concatenation of
    the lines that follow it, up to the next line starting with ">" or
    the end of the file. Line terminators are kept in both.
    """
    # Holds the header line that ended the previous entry, if there was one.
    name = ""
    while True:
        line = name or fp.readline()
        if not line:
            # readline() returns an empty string at end of file
            break
        seq = []
        while True:
            name = fp.readline()
            if not name or name.startswith(">"):
                break
            else:
                seq.append(name)
        yield (line, "".join(seq))
This gathers up the data after a starting line up to the next starting line. Making seq an array means that you minimize the string joining until the last possible moment. Yielding a tuple makes more sense than a list.
def read_fasta(filename):
    """Yield a (name, sequence) tuple for each record in the named file.

    A line starting with ">" begins a record. Its name is the text after
    the ">" up to the first "|". The lines that follow are joined,
    without their newlines, to form the sequence. Lines before the first
    ">" line are discarded, and an empty file yields nothing.
    """
    name = None
    parts = []
    with open(filename) as handle:
        for line in handle:
            # Strip only the newline; slicing off the last character
            # would lose data when the file has no trailing newline.
            line = line.rstrip("\n")
            if line.startswith(">"):
                if name is not None:
                    yield (name, "".join(parts))
                name = line[1:].split("|")[0]
                parts = []
            else:
                parts.append(line)
    # The last record has no following header line to trigger its yield.
    if name is not None:
        yield (name, "".join(parts))

Categories

Resources