The purpose is to search an infile (html) and reproduce the URLs of any images in an outfile which could be passed to wget. This would be the first useful thing I've written in Python and it appears to work well on Fedora. I couldn't find anything specifically like this anywhere. Does anyone have suggestions for improving on this?
import fileinput
import re

# prefix and postfix delimit the substring we want from each infile line;
# replace 'output.txt' below with the name of your outfile.
prefix = '<img src='
postfix = '.jpg'

# 'with' guarantees the outfile is flushed and closed even on error.
with open('output.txt', 'w') as outfile:
    # Read the infile(s) named on the command line (or stdin) line by line.
    for line in fileinput.input():
        # str.find avoids the regex-metacharacter surprises that
        # re.search(prefix, ...) would introduce, and returns -1 when absent.
        first_index = line.find(prefix)
        if first_index == -1:
            continue
        second_index = line.find(postfix, first_index)
        if second_index == -1:
            # Prefix without postfix on the same line: skip, instead of
            # silently reusing a stale second_index from a previous
            # iteration (a latent bug in the original).
            continue
        # Slice out everything between the prefix and the end of '.jpg',
        # then write it with a newline so wget can consume the file.
        outfile.write(line[first_index + len(prefix):second_index + len(postfix)] + '\n')
I've done something like this in the past and it worked pretty well... I'm sure it will be more accurate than trying to parse with regex.
# Python 2: HTMLParser was renamed html.parser in Python 3.
from HTMLParser import HTMLParser


class ImageFinder(HTMLParser):
    """Write the src attribute of every <img> start tag to output.txt."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.file = open('output.txt', 'w')

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the current tag.
        if tag == "img":
            # Loop instead of [ ... ][0]: the original raised IndexError
            # on an <img> tag that has no src attribute.
            for name, value in attrs:
                if name == "src":
                    self.file.write(value + "\n")
                    break

    def close(self):
        # Flush the parser, then close our outfile.  The original defined
        # __exit__, which is only invoked by a 'with' statement and was
        # therefore never called, leaking the file handle.
        HTMLParser.close(self)
        self.file.close()


# Read the input in a 'with' block so the input handle is closed too.
with open("myfile.txt") as f:
    inputdata = f.read()
parser = ImageFinder()
parser.feed(inputdata)
parser.close()
Related
I am having difficulty getting the right code to parse out chapters from this ebook and then have the 27 chapters print out into their own text files. The farthest I've gotten is to print "CHAPTER-1.txt". I don't want to hard-code anything and am unsure where I've completely missed the mark.
# Pull candidate chapter titles out of the table of contents of dracula.txt.
# 'with' replaces the manual open()/close() pair of the original.
with open('dracula.txt', 'r') as infile:
    readlines = infile.readlines()

# Slice 74:185 covers the table of contents in this particular copy of the
# text — TODO confirm against the actual Gutenberg file in use.
toc_list = readlines[74:185]

# Keep non-blank TOC lines, stripped of surrounding whitespace.
toc_text_lines = [line.strip() for line in toc_list if len(line) > 1]
#print(len(toc_text_lines))

# A real TOC entry ends with its page number, e.g. "CHAPTER I. ... 1".
# Guard against lines whose split() comes back empty: the original indexed
# [-1] unconditionally and could raise IndexError on such a line.
chaptitles = [line for line in toc_text_lines
              if line.split() and line.split()[-1].isdigit()]
#print(len(chaptitles))
print(chaptitles)
import re
# NOTE(review): this snippet is incomplete and does not parse as written —
# the 'for' line mixes a loop header with a bare list literal, and the
# final 'with' has no body.  The comments below describe apparent intent
# only; none of it can be confirmed by running the code.
with open('dracula.txt') as f:
book = f.readlines()
# Consume the book from the front, one line at a time.
while book:
line = book.pop(0)
# Apparent intent: detect a "CHAPTER" heading followed by a blank line.
# Note the second pop() discards a line even when it is not '\n'.
if "CHAPTER" in line and book.pop(0) == '\n':
for title in chapters_names_list: ['CHAPTER I.', 'CHAPTER II.',
'CHAPTER III.']
# Apparent goal: write each chapter into its own "<name>.txt" file.
with open("{}.txt".format(chapters_names_list), 'w') :
I think you could benefit from generators, suppose one of the ebooks is too big to fit into memory, you will have some issues.
What you can do is construct sort of a data processing pipeline, first look for the file(ebook.txt) in the filesystem, though have in mind that we need all functions to be as general as possible, once we have the filename we open it and yield one line at a time, and finally we scan each line for 'CHAPTER I.', 'CHAPTER II.', etc
import os
import re
import fnmatch
def find_files(pattern, path):
    """Yield the full path of every file under *path* matching *pattern*.

    *pattern* is a shell-style wildcard (fnmatch), so the concrete file
    name — e.g. 'dracula.txt' — never has to be hard-coded.
    """
    for dirpath, _subdirs, filenames in os.walk(path):
        matched = fnmatch.filter(filenames, pattern)
        for filename in matched:
            yield os.path.join(dirpath, filename)
def file_opener(filenames):
    """Open each '.txt' filename in turn and yield the open file object.

    The 'with' statement guarantees the file is closed even if the
    consumer of this generator raises or abandons the generator early;
    the original called f.close() only after being resumed, so an
    early-exiting consumer leaked the handle.
    """
    for filename in filenames:
        if filename.endswith('.txt'):
            with open(filename, 'rt') as f:
                yield f
def chain_generators(iterators):
    """Flatten a sequence of iterables into one continuous stream of items."""
    for iterable in iterators:
        # 'yield from' delegates to the sub-iterable, re-yielding each of
        # its items in order.
        yield from iterable
def grep(pattern, lines):
    """Yield only the lines in which the regex *pattern* is found."""
    # Compile once and bind the search method so the loop body does a
    # single fast call per line.
    matcher = re.compile(pattern).search
    for candidate in lines:
        if matcher(candidate):
            yield candidate
# A simple way to use these functions together
# (replace 'Path/to/files' with the directory that actually holds the ebooks).
logs = find_files('dracula*', 'Path/to/files')
files = file_opener(logs)
lines = chain_generators(files)
# grep treats its first argument as a regular expression, so the '.' in
# 'CHAPTER I.' matches any character here, not only a literal dot.
each_line = grep('CHAPTER I.', lines)
for match in each_line:
print(match)
You can build on top of these implementation to accomplish what you're trying to do.
Let me know if this helped.
For the function below I'm not getting any errors but no files appear in the same directory as where this .py is saved. Does anyone have any ideas for why this is?
import itertools
# Required for re.compile below — missing in the original, which made
# the very first statement raise NameError.
import re

# Pre-compiled pattern matching any single HTML tag such as <p> or </div>.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every <...> HTML tag removed."""
    return TAG_RE.sub('', text)


def get_rid_of_html():
    """Read messages.htm, strip its HTML tags, write the result to a .txt file."""
    print("removing html tags")
    with open('messages.htm', "r", encoding="utf8") as f:
        lines = f.read()
    print(len(lines))
    lines_removed_html = remove_tags(lines)
    with open('messages_txt_remove_list.txt', 'w', encoding="utf8") as f:
        f.write(lines_removed_html)


get_rid_of_html()
You are missing the regular expression module in the code. You need the module for re.compile(r'<[^>]+>'):
import re
TAG_RE = re.compile(r'<[^>]+>')
Also look here: https://docs.python.org/3/library/re.html
I have tried to create a python function which takes in 2 parameters; a file name and a search string. In this case the file name is the script itself (script.py) and the search string is 'name = "JOHN"'
#!/usr/local/bin/python2.7
import os, sys
#################
# Variable string
name = "JOHN"
#################
# Main function
def search_script_for_string(filename, searchString):
    """Report and return whether *searchString* occurs anywhere in *filename*.

    The original read the whole file and then iterated over the resulting
    STRING, which yields one character at a time, so a multi-character
    search string could never match.  Iterating the file object yields
    whole lines instead.  (The original also used the Python-2 form
    "print (...) % x", which raises TypeError on Python 3.)
    """
    with open(filename, 'r') as f:   # 'with' closes the file on every path
        for line in f:               # line-by-line: no full-file copy in memory
            if searchString in line:
                print('Found string: %s' % searchString)
                return True
    print('Did not find: %s' % searchString)
    return False
#################
# Pass the file name and the search string parameter to the function
# NOTE(review): assumes a file named test.py exists in the working
# directory, even though the surrounding text says the script itself is
# script.py — confirm which file is intended.
search_script_for_string("test.py","name = \"" + name + "\"")
The problem is that it doesn't return expected results:
$ Did not find: name = "JOHN"
When it meant to say:
$ Found string: name = "JOHN"
If anyone can help correct my understanding of where I'm going wrong here, I'd be massively appreciative. Thanks
f.read() returns the entire contents of the file as a single string. You then iterate over those contents -- but iterating over a string yields only 1 character at a time so there is no way a character will contain the substring you are looking for.
def search_script_for_string(filename, searchString):
    """Return True if *searchString* appears anywhere in *filename*."""
    with open(filename, 'r') as handle:
        contents = handle.read()
    # Substring containment over the whole file at once.
    return searchString in contents
should do the trick. Alternatively, if you want to search line-by-line:
def search_script_for_string(filename, searchString):
    """Return True if *searchString* occurs on any line of *filename*.

    Bug fix: the original executed 'return' on the FIRST iteration no
    matter what, so only the first line of the file was ever examined.
    We must keep scanning until a match is found or the file ends.
    """
    with open(filename, 'r') as f:
        for line in f:
            if searchString in line:
                return True
    return False
You are iterating over each character of the file, because f.read() returns the whole file as one string and iterating over a string yields one character at a time.
Use for line in f and you will indeed iterate over each line.
Also prefer the use of with, this makes your code a lot more robust.
So this would be better:
# Skeleton only: 'with' closes the file automatically, and iterating the
# file object yields one line at a time (each line keeps its trailing '\n').
with open('fileName') as f:
for line in f:
#process
This prints out the number of all the lines:
def links(htmlfile):
    """Return the number of lines in *htmlfile* that end with a </a> tag.

    Fixes in this version:
    - actually uses the *htmlfile* parameter instead of a hard-coded name;
    - returns the count of matching lines rather than the total line count;
    - the original's print statement sat after 'return', so it was
      unreachable (and referenced a variable that was out of scope);
    - 'with' closes the file on every path.
    """
    with open(htmlfile, 'r') as infile:
        # rstrip() removes the trailing newline so endswith() can match.
        return sum(1 for line in infile if line.rstrip().endswith('</a>'))
But I only need the number of lines which contain < / a > at the end.
The loop way:
# Count lines that end with '</a>'.  Bug fix: lines read from a file keep
# their trailing '\n', so the original's bare endswith('</a>') could match
# (at most) only a final line that lacks a newline — rstrip('\n') first.
with open('twolinks.html') as f:
    count = 0
    for line in f:
        if line.rstrip('\n').endswith('</a>'):
            count += 1
Using comprehension:
with open('twolinks.html') as f:
sum( 1 for line in f if line.endswith('</a>') )
Or even shorter (summing booleans, treating them as 0s and 1s):
# True/False count as 1/0, so summing the booleans counts matching lines.
# NOTE(review): lines from a file keep their trailing '\n', which will
# defeat endswith('</a>') — rstrip each line first; 'f' must also be a
# freshly opened (not yet exhausted) file object.
sum( line.endswith('</a>') for line in f )
import re

# Count the lines of 'data' in which a closing anchor tag appears
# anywhere (re.search scans the whole line, not just its end).
with open('data') as f:
    tag_pattern = re.compile('</a>')
    print(sum(1 for line in f if tag_pattern.search(line)))
# Counts lines that contain '</a>' anywhere (not only at the end).
num_lines = sum(1 for line in open('file') if '</a>' in line)
# NOTE(review): Python 2 print statement; the open() handle above is
# also never explicitly closed.
print num_lines
I guess that my answer is a bit longer in terms of code lines, but why not use a HTML parser since you know that you are parsing HTML?
for instance:
# Python 2 code: HTMLParser moved to html.parser in Python 3, and the
# bare 'print' statements below are Python-2-only syntax.
from HTMLParser import HTMLParser
# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
# Running total of </a> end tags seen so far.
self.count = 0
def handle_endtag(self, tag):
# Count only anchor end tags; every other end tag is ignored.
if tag == "a":
self.count += 1
print "Encountered an end tag :", tag
print self.count
# instantiate the parser and fed it some HTML
parser = MyHTMLParser()
parser.feed('<html><head><title>Test</title></head>'
'<body><h1>Parse me!</h1><a></a></body></html>')
this is modified code from the python pages. This is then easier to modify if you find the need for collecting other tags, or data with tags etc.
Or you could do something like that:
# Count the lines of file.txt that end with a closing </a> tag.
count = 0
# 'with' closes the file (the original left the handle open).
with open("file.txt", "r") as f:
    for line in f:
        # Strip the trailing newline (if any), then test the suffix.
        # The original's fixed-width line[-5:] slice assumed a '\n' was
        # present, so it missed a match on a final line longer than four
        # characters that has no trailing newline.
        if line.rstrip('\n').endswith('</a>'):
            count += 1
In general, you go through the file one line at a time,
and see if the last characters (without the \n) match </a>.
See if the \n stripping gives you any trouble.
Currently, I'm trying to search for an exact word/phrase in a text file. I am using Python 3.4
Here is the code I have so far.
import re
def main():
    """Prompt for a file name and a search term, then filter the file."""
    file_name = input("Please input the file name").lower()
    term = input("Please enter the search term").lower()
    # The user types the bare name; the .txt extension is appended here.
    regex_search(file_name + ".txt", term)
def regex_search(file, term):
    """Copy every line of *file* matching the regex *term* into new.txt.

    Improvements over the original: 'with' closes both files on every
    path (the original leaked them on an exception), and matches are
    written as they are found instead of being accumulated in a list.
    """
    with open(file, 'r') as source, open("new.txt", 'w') as destination:
        for line in source:
            if re.search(term, line):
                destination.write(line)
# Dead code preserved as a triple-quoted string literal (an expression
# statement: evaluated and immediately discarded at run time).
'''
def search(file, term): #This function doesn't work
source = open(file, 'r')
destination = open("new.txt", 'w')
lines = [line for line in source if term in line.split()]
for line in lines:
destination.write(line)
source.close()
destination.close()'''
# Kick off the interactive prompts defined in main().
main()
In my function regex_search I use regex to search for the particular string. However, I don't know how to search for a particular phrase.
In the second function, search, I split the line into a list and search for the word in there. However, this won't be able to search for a particular phrase because I am searching for ["dog walked"] in ['the','dog','walked'] which won't return the correct lines.
edit: Considering that you don't want to match partial words ('foo' should not match 'foobar'), you need to look ahead in the data stream. The code for that is a bit awkward, so I think regex (your current regex_search with a fix) is the way to go:
def regex_search(filename, term):
    """Write lines of *filename* containing *term* as a whole word to new.txt.

    The trailing group r'([^\w-]|$)' rejects partial-word matches such as
    'foo' inside 'foobar' while still accepting punctuation, whitespace,
    or end-of-line immediately after the term.
    """
    searcher = re.compile(term + r'([^\w-]|$)').search
    # Bug fix: the original called open(file, ...) — 'file' is not defined
    # here; the parameter is named 'filename', so every call raised NameError.
    with open(filename, 'r') as source, open("new.txt", 'w') as destination:
        for line in source:
            if searcher(line):
                destination.write(line)