Iterate through multiple files and append text from HTML using Beautiful Soup - python

I have a directory of downloaded HTML files (46 of them) and I am attempting to iterate through each of them, read their contents, strip the HTML, and append only the text into a text file. However, I'm unsure where I'm going wrong, as nothing gets written to my text file.
import os
import glob
from bs4 import BeautifulSoup

path = "/"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (path)
    soup = BeautifulSoup(markup)
    with open("example.txt", "a") as myfile:
        myfile.write(soup)
        f.close()
-----update----
I've updated my code as shown below; however, the text file still doesn't get created.
import os
import glob
from bs4 import BeautifulSoup

path = "/"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (infile)
    soup = BeautifulSoup(markup)
    with open("example.txt", "a") as myfile:
        myfile.write(soup)
        myfile.close()
-----update 2-----
Ah, I caught that my directory path was incorrect, so now I have:
import os
import glob
from bs4 import BeautifulSoup

path = "c:\\users\\me\\downloads\\"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (infile)
    soup = BeautifulSoup(markup)
    with open("example.txt", "a") as myfile:
        myfile.write(soup)
        myfile.close()
When this is executed, I get this error:
Traceback (most recent call last):
  File "C:\Users\Me\Downloads\bsoup.py", line 11, in <module>
    myfile.write(soup)
TypeError: must be str, not BeautifulSoup
I fixed this last error by changing
myfile.write(soup)
to
myfile.write(soup.get_text())
-----update 3-----
It's working properly now; here's the working code:
import os
import glob
from bs4 import BeautifulSoup

path = "c:\\users\\me\\downloads\\"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (infile)
    soup = BeautifulSoup(open(markup, "r").read())
    with open("example.txt", "a") as myfile:
        myfile.write(soup.get_text())
        myfile.close()
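For what it's worth, a slightly tidier version of the same loop (a sketch, assuming bs4 with the stdlib parser) uses context managers so the HTML files actually get closed, names the parser explicitly, and drops the redundant myfile.close(), which the with block already handles:

import os
import glob
from bs4 import BeautifulSoup

path = "c:\\users\\me\\downloads\\"
for infile in glob.glob(os.path.join(path, "*.html")):
    # both with blocks close their files automatically
    with open(infile, "r") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    with open("example.txt", "a") as myfile:
        myfile.write(soup.get_text())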

Actually, you are not reading the HTML file; this should work:
soup=BeautifulSoup(open(webpage,'r').read(), 'lxml')
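BeautifulSoup also accepts an open file handle directly, so an equivalent sketch (assuming webpage holds the path to one HTML file, like infile in the loop above):

from bs4 import BeautifulSoup

with open(webpage, 'r') as f:
    # bs4 reads from the file object itself; no explicit .read() needed
    soup = BeautifulSoup(f, 'lxml')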

If you want to use lxml.html directly, here is a modified version of some code I've been using for a project. If you want to grab all the text, just don't filter by tag. There may be a way to do it without iterating, but I don't know of one. It saves the data as unicode, so you will have to take that into account when opening the file.
import os
import glob
import lxml.html

path = '/'

# Whatever tags you want to pull text from.
visible_text_tags = ['p', 'li', 'td', 'h1', 'h2', 'h3', 'h4',
                     'h5', 'h6', 'a', 'div', 'span']

for infile in glob.glob(os.path.join(path, "*.html")):
    doc = lxml.html.parse(infile)
    file_text = []

    for element in doc.iter():  # Iterate once through the entire document
        try:  # Grab tag name and text (+ tail text)
            tag = element.tag
            text = element.text
            tail = element.tail
        except:
            continue

        words = None  # text words split to list
        if tail:  # combine text and tail
            text = text + " " + tail if text else tail
        if text:  # lowercase and split to list
            words = text.lower().split()

        if tag in visible_text_tags:
            if words:
                file_text.append(' '.join(words))

    with open('example.txt', 'a') as myfile:
        myfile.write(' '.join(file_text).encode('utf8'))
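Note that the final write is Python 2-style: it encodes to bytes before writing. Under Python 3 you would open the file with an explicit encoding and write the string directly; a minimal sketch of just that step:

# Python 3: let the file object handle the encoding and write str directly
with open('example.txt', 'a', encoding='utf8') as myfile:
    myfile.write(' '.join(file_text))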

Related

Can't write CSV file properly using bs4 in Python for local HTML; only 4 KB being sent to CSV instead of the full data (tried many different approaches)

import os
from bs4 import BeautifulSoup
import re
import pandas as pd
import glob

os.chdir(r"C:\\Users\*\*")
directory = os.getcwd()

for filename in os.listdir(directory):
    if filename.endswith('.html'):
        fname = os.path.join(directory, filename)
        print("Current file name ..", os.path.abspath(fname))
        with open(fname, "r", encoding="ISO-8859-1") as file:
            def find_tags_from_class(html):
                soup = BeautifulSoup(html, "html.parser")
                tags = soup.find_all("div", class_="item-ia")
                final = pd.DataFrame()
                for tag in tags:
                    if 'data-id' in tag.attrs:
                        attri = ['data-id', 'data-mediatype', 'data-year']
                        tag.attrib = attri
                        results = tag.attrs
                        df = pd.DataFrame(results)
                        # pd.concat([df], ignore_index=True)
                        final.to_csv('finalbooks.csv', encoding="ISO-8859-1")
                        final = final.append(df)
                        print(results, sep='\t')
            find_tags_from_class(file)
I tried to treat it as an overwriting problem but didn't succeed. Please suggest code that solves the data loss: thousands of lines appear on screen (from the print command), but exactly 78 lines end up in the CSV. Suggestions for a better way to structure this whole script are also welcome. Thanks, all.
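A likely cause of the loss: final.to_csv('finalbooks.csv', ...) runs inside the tag loop, before each df is appended, so the CSV is rewritten every iteration from a frame that is one step behind, and final itself is recreated for every HTML file, discarding earlier files' rows. A minimal sketch of one restructuring, assuming the same item-ia divs and attribute names from the question: collect plain dicts across all files, then build and write the DataFrame once at the end.

import os
import glob
import pandas as pd
from bs4 import BeautifulSoup

rows = []  # one dict of attributes per matching tag, across all files

for fname in glob.glob(os.path.join(os.getcwd(), "*.html")):
    with open(fname, "r", encoding="ISO-8859-1") as file:
        soup = BeautifulSoup(file, "html.parser")
    for tag in soup.find_all("div", class_="item-ia"):
        if 'data-id' in tag.attrs:
            # keep only the attributes of interest
            rows.append({key: tag.attrs.get(key)
                         for key in ('data-id', 'data-mediatype', 'data-year')})

# build the frame once and write the CSV once, after all files are read
final = pd.DataFrame(rows)
final.to_csv('finalbooks.csv', encoding="ISO-8859-1", index=False)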

Extract text from multiple .html files and save it in separate .txt files

I am trying to extract text content from 24 folders; every folder has several (100+) .html files. I need to create 24 .txt files to save the text extracted from the .html files.
I have done most of the job except saving the .txt files: the code I wrote saves 24 files, all with the same content. I know something is wrong in the following part:
for number in range(1,25):
    with open('Text'+"%02d" % number +" .txt", "w", encoding='utf-8') as text:
        for i in passage:
            text.write(i+' ')
All the code is listed below:
# Read files and call functions
from bs4 import BeautifulSoup
import os
import numpy as np

gap_html = os.listdir('gap-html')
print(gap_html)
# print(folder)

passage = list()
# out = "all.txt"

# def Convertfile():
for textFolders in gap_html:
    # domain = os.path.abspath(r'../gap-html')
    folder = os.path.join(os.path.abspath('gap-html'), textFolders)
    # text_folder=os.path.abspath(folder)
    # Lists the file names under all folders
    textFiles = os.listdir(folder)
    for textFile in textFiles:
        file = os.path.join(os.path.abspath(folder), textFile)
        print(file)
        html = open(file, 'r', encoding="utf-8").read()
        # print("Start reading file...")
        soup = BeautifulSoup(html, features='lxml')
        page = soup.find_all('span', {"class": "ocr_cinfo"})
        for word in page:
            word = word.get_text()
            passage.append(word)

for number in range(1,25):
    with open('Text'+"%02d" % number +" .txt", "w", encoding='utf-8') as text:
        for i in passage:
            text.write(i+' ')
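The reason all 24 files come out identical is that passage keeps accumulating across every folder, and the final range(1, 25) loop then writes that one combined list 24 times. A minimal sketch of one fix, assuming the same gap-html layout: reset passage per folder and write that folder's .txt inside the outer loop.

from bs4 import BeautifulSoup
import os

gap_html = sorted(os.listdir('gap-html'))

for number, textFolder in enumerate(gap_html, start=1):
    folder = os.path.join(os.path.abspath('gap-html'), textFolder)
    passage = []  # fresh list per folder, so files don't share content
    for textFile in os.listdir(folder):
        with open(os.path.join(folder, textFile), 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), features='lxml')
        for word in soup.find_all('span', {"class": "ocr_cinfo"}):
            passage.append(word.get_text())
    with open('Text%02d.txt' % number, 'w', encoding='utf-8') as text:
        text.write(' '.join(passage))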

I want to open HTML files in a directory sequentially, parse information using BeautifulSoup, and save it as a CSV file

At first, I tried to open just one file named 'index.html', parse it, and save it as a CSV file. This was the code, and it worked well.
with open('/Users/kwon/Downloads/cnn/index.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

cnn_file = open('cnn2.csv', 'w')
cnn_writer = csv.writer(cnn_file)
cnn_writer.writerow(['filename','date','headline','text'])

filename = 'index000'
print(filename)

date = soup.find(class_='update-time').text
date = date.split(' ')[5]+' '+date.split(' ')[6]+' '+date.split(' ')[7]
print(date)

headline = soup.title.text
headline = headline.split('-')[0]
print(headline)

txt = soup.find(class_="zn zn-body-text zn-body zn--idx-0 zn--ordinary zn-has-multiple-containers zn-has-r'\d*'-containers").text
print(txt)

cnn_writer.writerow([filename, date, headline, txt])
cnn_file.close()
But I want to repeat the same process for all HTML files (index.html ~ index591.html) in a directory. So I started by using the glob module to open the files sequentially, then tried a for loop to parse as I did before. I don't know how to read and parse them sequentially and name each filename from 'index000' to 'index591'. Also, if I run the code below, I get the error 'find() takes no keyword arguments'.
import glob

path = '/Users/kwon-yejin/Downloads/cnn2/*.html'
files = glob.glob(path)

for file in files:
    html = open(file, 'r')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    for line in soup:
        filename = 'index000'
        print(filename)
        date = line.find(class_='update-time').text
        date = date.split(' ')[5]+' '+date.split(' ')[6]+' '+date.split(' ')[7]
        print(date)
        headline = line.title.text
        headline = headline.split('-')[0]
        print(headline)
        txt = line.find(class_="zn zn-body-text zn-body zn--idx-0 zn--ordinary zn-has-multiple-containers zn-has-21-containers").text
        print(txt)
Read and parse them sequentially, with filenames from 'index000' to 'index591':
path = '/Users/kwon-yejin/Downloads/cnn2/'
for i in range(592):
    file = path+'index'+str(i).zfill(3)+'.html'
    print(file)
/Users/kwon-yejin/Downloads/cnn2/index000.html
/Users/kwon-yejin/Downloads/cnn2/index001.html
/Users/kwon-yejin/Downloads/cnn2/index002.html
/Users/kwon-yejin/Downloads/cnn2/index003.html
..................
/Users/kwon-yejin/Downloads/cnn2/index589.html
/Users/kwon-yejin/Downloads/cnn2/index590.html
/Users/kwon-yejin/Downloads/cnn2/index591.html
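As for the 'find() takes no keyword arguments' error: iterating with for line in soup: yields top-level nodes, including NavigableString objects, and on a string find is the plain str.find, which accepts no keyword arguments. Parsing soup directly, with no inner loop, avoids that. Below is a sketch tying the sequential filenames to the original parsing and CSV writing; the paths and the update-time class are copied from the question, while matching class_='zn-body-text' alone is an assumption, since the full class string seems to vary per page.

import csv
import bs4

path = '/Users/kwon-yejin/Downloads/cnn2/'

with open('cnn2.csv', 'w') as cnn_file:
    cnn_writer = csv.writer(cnn_file)
    cnn_writer.writerow(['filename', 'date', 'headline', 'text'])
    for i in range(592):
        filename = 'index' + str(i).zfill(3)
        with open(path + filename + '.html', 'r') as html_file:
            soup = bs4.BeautifulSoup(html_file, 'html.parser')
        date = ' '.join(soup.find(class_='update-time').text.split(' ')[5:8])
        headline = soup.title.text.split('-')[0]
        # class_ matches any element whose class list contains this value
        txt = soup.find(class_='zn-body-text').text
        cnn_writer.writerow([filename, date, headline, txt])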

python: change data hyperlink in HTML file

There's a link on our website that leads to a zip folder. The line in the HTML file for it is shown thus:
<p>Address Points (updated weekly)</p>
The zip folder's name will soon be changed using the current date so that it looks like this:
WillCounty_AddressPoint_02212018.zip
How do I change the corresponding line in the HTML?
Using this answer I have a script. It runs with no errors but does not change anything in the HTML file.
import bs4
from bs4 import BeautifulSoup
import re
import time

data = r'\\gisfile\GISstaff\Jared\data.html' # html file location
current_time = time.strftime("_%m%d%Y") # date

# load the file
with open(data) as inf:
    txt = inf.read()
    soup = bs4.BeautifulSoup(txt)

# create new link
new_link = soup.new_tag('link', href="Data/WillCounty_AddressPoint_%m%d%Y.zip")
# insert it into the document
soup.head.append(new_link)

# save the file again
with open(data, "w") as outf:
    outf.write(str(soup))
This is how you could use BeautifulSoup to replace the href attribute.
from bs4 import BeautifulSoup
import time

data = r'data.html' # html file location

# load the file
current_time = time.strftime("_%m%d%Y")
with open(data) as inf:
    txt = inf.read()
    soup = BeautifulSoup(txt, 'html.parser')

a = soup.find('a')
a['href'] = ("WillCounty_AddressPoint%s.zip" % current_time)
print(soup)

# save the file again
with open(data, "w") as outf:
    outf.write(str(soup))
Outputs:
<p>Address Points (updated weekly)</p>
And writes to the file
UPDATED to use data from supplied file.
from bs4 import BeautifulSoup
import time

data = r'data.html' # html file location

# load the file
current_time = time.strftime("_%m%d%Y")
with open(data) as inf:
    txt = inf.read()
    soup = BeautifulSoup(txt, 'html.parser')

# Find the a element you want to change by finding its text and selecting the parent.
a = soup.find(text="Address Points").parent
a['href'] = ("WillCounty_AddressPoint%s.zip" % current_time)
print(soup)

# save the file again
with open(data, "w") as outf:
    outf.write(str(soup))
It will, however, take out blank lines, but otherwise it leaves your HTML code exactly as it was.
Using a diff tool to see differences in the original and modified files:
diff data\ \(copy\).html data.html
77c77
< <p>Address Points (updated weekly)</p>
---
> <p>Address Points (updated weekly)</p>
116,120d115
<
<
<
<
<
154d148
<

for loop returning only last item in the list

url="someurl"
outputfile='./file.zip'
link=urllib.urlopen(url)
soup= bs4.BeautifulSoup(link,'lxml')
links=[]
for data in soup.find_all('div', class_='master_content-outer-container'):
for a in data.find_all('a'):
links.append(a.get('href'))
output = open(outputfile, "wb")
for i in links:
request=urllib.urlopen(i)
read=request.read()
output.write(read)
output.close()
zip_ref= zipfile.ZipFile(outputfile,'r')
zip_ref.extractall('./data/')
zip_ref.close()
I have URLs stored in a list, and I am supplying them to urllib. Each URL ends with a .zip extension. When I run this code, only the last file in the list gets downloaded. There are over 400 links to be downloaded.
Am I missing something?
So you are writing all your files into one; that's not going to work.
Try this:
import os

url = "someurl"
outputfile = './file.zip'
link = urllib.urlopen(url)
soup = bs4.BeautifulSoup(link, 'lxml')

links = []
for data in soup.find_all('div', class_='master_content-outer-container'):
    for a in data.find_all('a'):
        links.append(a.get('href'))

for i in links:
    request = urllib.urlopen(i)
    read = request.read()
    file_name = os.path.basename(i)
    output = open(file_name, "wb")
    output.write(read)
    output.close()
    zip_ref = zipfile.ZipFile(file_name, 'r')
    zip_ref.extractall('./data/')
    zip_ref.close()
Option 2
import os

url = "someurl"
outputfile = './file.zip'
link = urllib.urlopen(url)
soup = bs4.BeautifulSoup(link, 'lxml')

def download_and_extract(link):
    request = urllib.urlopen(link)
    read = request.read()
    file_name = os.path.basename(link)
    output = open(file_name, "wb")
    output.write(read)
    output.close()
    zip_ref = zipfile.ZipFile(file_name, 'r')
    zip_ref.extractall('./data/')
    zip_ref.close()

for data in soup.find_all('div', class_='master_content-outer-container'):
    for a in data.find_all('a'):
        download_and_extract(a.get('href'))
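Both options above are Python 2 (urllib.urlopen). A sketch of the same download-and-extract helper under Python 3, assuming the same link list:

import os
import zipfile
from urllib.request import urlopen

def download_and_extract(link):
    data = urlopen(link).read()           # download the zip payload
    file_name = os.path.basename(link)
    with open(file_name, "wb") as output:  # save it next to the script
        output.write(data)
    with zipfile.ZipFile(file_name, 'r') as zip_ref:
        zip_ref.extractall('./data/')      # unpack into ./data/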
