There's a link on our website that points to a zip file. The relevant line in the HTML file looks like this:
<p>Address Points (updated weekly)</p>
The zip file's name will soon be changed to include the current date, so it will look like this:
WillCounty_AddressPoint_02212018.zip
How do I change the corresponding line in the HTML?
Using this answer I have a script. It runs with no errors but does not change anything in the HTML file.
import bs4
from bs4 import BeautifulSoup
import re
import time
data = r'\\gisfile\GISstaff\Jared\data.html' #html file location
current_time = time.strftime("_%m%d%Y") #date
#load the file
with open(data) as inf:
    txt = inf.read()
    soup = bs4.BeautifulSoup(txt)

#create new link
new_link = soup.new_tag('link', href="Data/WillCounty_AddressPoint_%m%d%Y.zip")

#insert it into the document
soup.head.append(new_link)

#save the file again
with open(data, "w") as outf:
    outf.write(str(soup))
This is how you could use BeautifulSoup to replace the href attribute.
from bs4 import BeautifulSoup
import time
data = r'data.html' #html file location
#load the file
current_time = time.strftime("_%m%d%Y")
with open(data) as inf:
    txt = inf.read()

soup = BeautifulSoup(txt, 'html.parser')
a = soup.find('a')
a['href'] = "WillCounty_AddressPoint%s.zip" % current_time
print(soup)

#save the file again
with open(data, "w") as outf:
    outf.write(str(soup))
Outputs:
<p>Address Points (updated weekly)</p>
and writes the result back to the file.
UPDATED to use data from supplied file.
from bs4 import BeautifulSoup
import time
data = r'data.html' #html file location
#load the file
current_time = time.strftime("_%m%d%Y")
with open(data) as inf:
    txt = inf.read()

soup = BeautifulSoup(txt, 'html.parser')

# Find the <a> element you want to change by finding its text and selecting its parent.
a = soup.find(text="Address Points").parent
a['href'] = "WillCounty_AddressPoint%s.zip" % current_time
print(soup)

#save the file again
with open(data, "w") as outf:
    outf.write(str(soup))
It will, however, remove blank lines, but otherwise leave your HTML code exactly as it was.
Using a diff tool to see differences in the original and modified files:
diff data\ \(copy\).html data.html
77c77
< <p>Address Points (updated weekly)</p>
---
> <p>Address Points (updated weekly)</p>
116,120d115
<
<
<
<
<
154d148
<
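As an alternative, the link can also be located by its current href rather than by its text. The following is only a sketch: the "WillCounty_AddressPoint" prefix and the Data/ folder are assumptions based on the file name given in the question.

from bs4 import BeautifulSoup
import re
import time

data = r'data.html'  # html file location
current_time = time.strftime("_%m%d%Y")

with open(data) as inf:
    soup = BeautifulSoup(inf.read(), 'html.parser')

# find the first <a> whose href points at the address-point zip (prefix assumed)
a = soup.find('a', href=re.compile(r'WillCounty_AddressPoint.*\.zip'))
if a is not None:
    a['href'] = "Data/WillCounty_AddressPoint%s.zip" % current_time

with open(data, "w") as outf:
    outf.write(str(soup))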
import os
from bs4 import BeautifulSoup
import re
import pandas as pd
import glob
os.chdir(r"C:\\Users\*\*")
directory = os.getcwd()
for filename in os.listdir(directory):
if filename.endswith('.html'):
fname = os.path.join(directory, filename)
print("Current file name ..", os.path.abspath(fname))
with open(fname, "r", encoding = "ISO-8859-1") as file:
def find_tags_from_class(html):
soup = BeautifulSoup(html, "html.parser")
tags = soup.find_all("div", class_="item-ia")
final = pd.DataFrame()
for tag in tags:
if 'data-id' in tag.attrs:
attri=['data-id', 'data-mediatype', 'data-year']
tag.attrib=attri
results = tag.attrs
df = pd.DataFrame(results)
# pd.concat([df], ignore_index=True)
final.to_csv('finalbooks.csv', encoding = "ISO-8859-1")
final = final.append(df)
print(results, sep='\t')
find_tags_from_class(file)
I tried to solve it as an overwriting problem but didn't succeed. Can someone suggest code that fixes the data loss? Thousands of lines show up on screen from the print call, but exactly 78 lines end up in the CSV. Suggestions for a better way to structure this whole script are also welcome. Thanks all.
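One way the overwriting could be avoided (a sketch only, keeping the directory layout, class name, and attribute names from the question; the helper name collect_item_attrs is made up for illustration): collect the attributes of every matching tag in a list while looping, then build the DataFrame and write the CSV once, after all files have been processed.

import os
import pandas as pd
from bs4 import BeautifulSoup

def collect_item_attrs(directory):
    """Gather data-id / data-mediatype / data-year from every div.item-ia in every HTML file."""
    rows = []
    for filename in os.listdir(directory):
        if not filename.endswith('.html'):
            continue
        fname = os.path.join(directory, filename)
        with open(fname, "r", encoding="ISO-8859-1") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")
        for tag in soup.find_all("div", class_="item-ia"):
            if 'data-id' in tag.attrs:
                rows.append({key: tag.attrs.get(key)
                             for key in ('data-id', 'data-mediatype', 'data-year')})
    return rows

rows = collect_item_attrs(os.getcwd())
final = pd.DataFrame(rows)  # one DataFrame built from all rows
final.to_csv('finalbooks.csv', index=False, encoding="ISO-8859-1")  # written once, at the end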
My current code is cutting the first 6 characters from file names while downloading PDFs. For example, the PDF file name is 123456acII.pdf (https://example.com/wp-content/uploads/2016/11/123456acII.pdf), but the file saved in the folder is acII.pdf.
How can I make the names stay as they are?
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
main = "https://example.com/"
#If there is no such folder, the script will create one automatically
folder_location = r'C:\temp\webscraping'
if not os.path.exists(folder_location):os.mkdir(folder_location)
def Get_Links():
    r = requests.get(main).text
    soup = BeautifulSoup(r, 'html.parser')
    links = []
    for item in soup.findAll("div", {'class': 'large-4 medium-4 columns'}):
        for n in item.find_all('a'):
            print('Link: ' + n.get('href'))
            links.append(n.get('href'))
    return links

def Parse_Links():
    pdf = set()
    for url in Get_Links():
        r = requests.get(url).text
        soup = BeautifulSoup(r, 'html.parser')
        for item in soup.findAll("div", {'class': 'large-6 medium-8 columns large-centered'}):
            for link in item.findAll("a"):
                link = link.get("href")
                if link:
                    pdf.add(link)
    return pdf

def Save():
    for item in Parse_Links():
        print(f"Downloading File: {item[55:]}")
        filename = os.path.join(folder_location, f"{item[55:]}")
        r = requests.get(item)
        with open(filename, 'wb') as f:
            f.write(r.content)
    print("done")

Save()
It looks like you are slicing the string starting at index position 55 with item[55:]. Try simply starting the slice 6 positions earlier:
change to: item[49:]
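If the URLs don't all have the same length, a hard-coded slice will keep breaking. A more robust sketch (it assumes the file name is always the last path segment of the URL; the helper name filename_from_url is made up for illustration) derives the name from the URL itself:

import os
from urllib.parse import urlparse

def filename_from_url(url):
    """Return the last path segment of a URL, e.g. '123456acII.pdf'."""
    return os.path.basename(urlparse(url).path)

print(filename_from_url("https://example.com/wp-content/uploads/2016/11/123456acII.pdf"))
# -> 123456acII.pdf

Inside Save(), the file name could then be built with os.path.join(folder_location, filename_from_url(item)) instead of the slice.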
At first, I tried to open just one file named 'index.html', parse it, and save the result as a CSV file. This was the code, and it worked well:
import csv
from bs4 import BeautifulSoup

with open('/Users/kwon/Downloads/cnn/index.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

cnn_file = open('cnn2.csv', 'w')
cnn_writer = csv.writer(cnn_file)
cnn_writer.writerow(['filename', 'date', 'headline', 'text'])

filename = 'index000'
print(filename)

date = soup.find(class_='update-time').text
date = date.split(' ')[5] + ' ' + date.split(' ')[6] + ' ' + date.split(' ')[7]
print(date)

headline = soup.title.text
headline = headline.split('-')[0]
print(headline)

txt = soup.find(class_="zn zn-body-text zn-body zn--idx-0 zn--ordinary zn-has-multiple-containers zn-has-r'\d*'-containers").text
print(txt)

cnn_writer.writerow([filename, date, headline, txt])
cnn_file.close()
But I want to repeat the same process for all the HTML files (index.html~index591.html) in a directory. So I started by using the glob module to open the files sequentially, then tried a for loop to parse them as I did before. Somehow I don't know how to read and parse them sequentially and name filename from 'index000' to 'index591'. Also, if I run the code below, I get the error 'find() takes no keyword arguments'.
import glob
import bs4

path = '/Users/kwon-yejin/Downloads/cnn2/*.html'
files = glob.glob(path)

for file in files:
    html = open(file, 'r')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    for line in soup:
        filename = 'index000'
        print(filename)

        date = line.find(class_='update-time').text
        date = date.split(' ')[5] + ' ' + date.split(' ')[6] + ' ' + date.split(' ')[7]
        print(date)

        headline = line.title.text
        headline = headline.split('-')[0]
        print(headline)

        txt = line.find(class_="zn zn-body-text zn-body zn--idx-0 zn--ordinary zn-has-multiple-containers zn-has-21-containers").text
        print(txt)
Read and parse them sequentially, with filename going from 'index000' to 'index591':
path = '/Users/kwon-yejin/Downloads/cnn2/'

for i in range(592):
    file = path + 'index' + str(i).zfill(3) + '.html'
    print(file)
/Users/kwon-yejin/Downloads/cnn2/index000.html
/Users/kwon-yejin/Downloads/cnn2/index001.html
/Users/kwon-yejin/Downloads/cnn2/index002.html
/Users/kwon-yejin/Downloads/cnn2/index003.html
..................
/Users/kwon-yejin/Downloads/cnn2/index589.html
/Users/kwon-yejin/Downloads/cnn2/index590.html
/Users/kwon-yejin/Downloads/cnn2/index591.html
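Folding the parsing into that loop might look like the sketch below. The class names are copied from the question and assumed to exist in every file. Parsing each file's soup directly, instead of looping over soup, also avoids the 'find() takes no keyword arguments' error, which occurs when .find() is called on a plain string element.

import csv
from bs4 import BeautifulSoup

path = '/Users/kwon-yejin/Downloads/cnn2/'

with open('cnn2.csv', 'w') as cnn_file:
    cnn_writer = csv.writer(cnn_file)
    cnn_writer.writerow(['filename', 'date', 'headline', 'text'])

    for i in range(592):
        filename = 'index' + str(i).zfill(3)
        with open(path + filename + '.html') as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')

        # same field extraction as the single-file version
        date = soup.find(class_='update-time').text
        date = ' '.join(date.split(' ')[5:8])
        headline = soup.title.text.split('-')[0]
        txt = soup.find(class_="zn zn-body-text zn-body zn--idx-0 zn--ordinary zn-has-multiple-containers zn-has-21-containers").text

        cnn_writer.writerow([filename, date, headline, txt])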
I'm trying to scrape a forum discussion and export it as a csv file, with columns such as "thread title", "user", and "post", where the latter is the actual forum post from each individual.
I'm a complete beginner with Python and BeautifulSoup so I'm having a really hard time with this!
My current problem is that all the text is split into one character per row in the csv file. Is there anyone out there who can help me out? It would be fantastic if someone could give me a hand!
Here's the code I've been using:
from bs4 import BeautifulSoup
import csv
import urllib2
f = urllib2.urlopen("https://silkroad5v7dywlc.onion.to/index.php?action=printpage;topic=28536.0")
soup = BeautifulSoup(f)
b = soup.get_text().encode("utf-8").strip() #the posts contain non-ascii words, so I had to do this
writer = csv.writer(open('silkroad.csv', 'w'))
writer.writerows(b)
Ok here we go. Not quite sure what I'm helping you do here, but hopefully you have a good reason to be analyzing silk road posts.
You have a few issues here, the big one is that you aren't parsing the data at all. What you're essentially doing with .get_text() is going to the page, highlighting the whole thing, and then copying and pasting the whole thing to a csv file.
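A tiny illustration (in Python 3 syntax) of why the output came out one character per row: writerows() expects an iterable of rows, and iterating a plain string yields single characters, so each character becomes its own row.

import csv
import io

buf = io.StringIO()
csv.writer(buf).writerows("abc")  # a plain string instead of a list of rows
print(buf.getvalue())
# a
# b
# c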
So here is what you should be trying to do:
Read the page source
Use soup to break it into sections you want
Save sections in parallel arrays for author, date, time, post, etc
Write data to csv file row by row
I wrote some code to show you what that looks like, it should do the job:
from bs4 import BeautifulSoup
import csv
import urllib2
# get page source and create a BeautifulSoup object based on it
print "Reading page..."
page = urllib2.urlopen("https://silkroad5v7dywlc.onion.to/index.php?action=printpage;topic=28536.0")
soup = BeautifulSoup(page)
# if you look at the HTML all the titles, dates,
# and authors are stored inside of <dt ...> tags
metaData = soup.find_all("dt")
# likewise the post data is stored
# under <dd ...>
postData = soup.find_all("dd")
# define where we will store info
titles = []
authors = []
times = []
posts = []
# now we iterate through the metaData and parse it
# into titles, authors, and dates
print "Parsing data..."
for html in metaData:
    text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "")  # convert the html to text
    titles.append(text.split("Title:")[1].split("Post by:")[0].strip())  # get Title:
    authors.append(text.split("Post by:")[1].split(" on ")[0].strip())   # get Post by:
    times.append(text.split(" on ")[1].strip())                          # get date

# now we go through the actual post data and extract it
for post in postData:
    posts.append(BeautifulSoup(str(post)).get_text().encode("utf-8").strip())

# now we write data to csv file
# ***csv files MUST be opened with the 'b' flag***
csvfile = open('silkroad.csv', 'wb')
writer = csv.writer(csvfile)

# create template
writer.writerow(["Time", "Author", "Title", "Post"])

# iterate through and write all the data
for time, author, title, post in zip(times, authors, titles, posts):
    writer.writerow([time, author, title, post])
# close file
csvfile.close()
# done
print "Operation completed successfully."
EDIT: Included solution that can read files from directory and use data from that
Okay, so you have your HTML files in a directory. You need to get a list of files in the directory, iterate through them, and append to your csv file for each file in the directory.
This is the basic logic of our new program.
If we had a function called processData() that took a file path as an argument and appended data from the file to your csv file here is what it would look like:
# the directory where we have all our HTML files
dir = "myDir"
# our csv file
csvFile = "silkroad.csv"
# insert the column titles to csv
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Time", "Author", "Title", "Post"])
csvfile.close()
# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
    path = os.path.join(dir, htmlFile)  # get the file path
    processData(path)                   # process the data in the file
    print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..."  # display status
    count = count + 1                   # increment counter
As it happens our processData() function is more or less what we did before, with a few changes.
So this is very similar to our last program, with a few small changes:
We write the column headers first thing
Following that we open the csv with the 'ab' flag to append
We import os to get a list of files
Here's what that looks like:
from bs4 import BeautifulSoup
import csv
import urllib2
import os # added this import to process files/dirs
# ** define our data processing function
def processData(pageFile):
    ''' take the data from an html file and append to our csv file '''
    f = open(pageFile, "r")
    page = f.read()
    f.close()

    soup = BeautifulSoup(page)

    # if you look at the HTML all the titles, dates,
    # and authors are stored inside of <dt ...> tags
    metaData = soup.find_all("dt")

    # likewise the post data is stored
    # under <dd ...>
    postData = soup.find_all("dd")

    # define where we will store info
    titles = []
    authors = []
    times = []
    posts = []

    # now we iterate through the metaData and parse it
    # into titles, authors, and dates
    for html in metaData:
        text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "")  # convert the html to text
        titles.append(text.split("Title:")[1].split("Post by:")[0].strip())  # get Title:
        authors.append(text.split("Post by:")[1].split(" on ")[0].strip())   # get Post by:
        times.append(text.split(" on ")[1].strip())                          # get date

    # now we go through the actual post data and extract it
    for post in postData:
        posts.append(BeautifulSoup(str(post)).get_text().encode("utf-8").strip())

    # now we write data to csv file
    # ***csv files MUST be opened with the 'b' flag***
    csvfile = open('silkroad.csv', 'ab')
    writer = csv.writer(csvfile)

    # iterate through and write all the data
    for time, author, title, post in zip(times, authors, titles, posts):
        writer.writerow([time, author, title, post])

    # close file
    csvfile.close()
# ** start our process of going through files
# the directory where we have all our HTML files
dir = "myDir"
# our csv file
csvFile = "silkroad.csv"
# insert the column titles to csv
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Time", "Author", "Title", "Post"])
csvfile.close()
# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
    path = os.path.join(dir, htmlFile)  # get the file path
    processData(path)                   # process the data in the file
    print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..."  # display status
    count = count + 1                   # increment counter
I have a directory of downloaded HTML files (46 of them) and I am attempting to iterate through each of them, read their contents, strip the HTML, and append only the text to a text file. However, I'm unsure where I'm going wrong, as nothing gets written to my text file.
import os
import glob
from bs4 import BeautifulSoup
path = "/"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (path)
    soup = BeautifulSoup(markup)
    with open("example.txt", "a") as myfile:
        myfile.write(soup)
    f.close()
-----update----
I've updated my code as below, however the text file still doesn't get created.
import os
import glob
from bs4 import BeautifulSoup
path = "/"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (infile)
    soup = BeautifulSoup(markup)
    with open("example.txt", "a") as myfile:
        myfile.write(soup)
    myfile.close()
-----update 2-----
Ah, I caught that I had my directory incorrect, so now I have:
import os
import glob
from bs4 import BeautifulSoup
path = "c:\\users\\me\\downloads\\"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (infile)
    soup = BeautifulSoup(markup)
    with open("example.txt", "a") as myfile:
        myfile.write(soup)
    myfile.close()
When this is executed, I get this error:
Traceback (most recent call last):
  File "C:\Users\Me\Downloads\bsoup.py", line 11, in <module>
    myfile.write(soup)
TypeError: must be str, not BeautifulSoup
I fixed this last error by changing
myfile.write(soup)
to
myfile.write(soup.get_text())
-----update 3 ----
It's working properly now, here's the working code:
import os
import glob
from bs4 import BeautifulSoup
path = "c:\\users\\me\\downloads\\"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (infile)
    soup = BeautifulSoup(open(markup, "r").read())
    with open("example.txt", "a") as myfile:
        myfile.write(soup.get_text())
    myfile.close()
Actually, you are not reading the HTML file. This should work:
soup = BeautifulSoup(open(webpage, 'r').read(), 'lxml')
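Folded into the loop from the updates above, that suggestion might look like this sketch (it assumes the lxml package is installed and keeps the downloads path from the question):

import glob
import os
from bs4 import BeautifulSoup

path = "c:\\users\\me\\downloads\\"

for infile in glob.glob(os.path.join(path, "*.html")):
    with open(infile, "r") as page:
        soup = BeautifulSoup(page.read(), "lxml")  # requires the lxml parser to be installed
    with open("example.txt", "a") as myfile:
        myfile.write(soup.get_text())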
If you want to use lxml.html directly here is a modified version of some code I've been using for a project. If you want to grab all the text, just don't filter by tag. There may be a way to do it without iterating, but I don't know. It saves the data as unicode, so you will have to take that into account when opening the file.
import os
import glob
import lxml.html
path = '/'
# Whatever tags you want to pull text from.
visible_text_tags = ['p', 'li', 'td', 'h1', 'h2', 'h3', 'h4',
                     'h5', 'h6', 'a', 'div', 'span']

for infile in glob.glob(os.path.join(path, "*.html")):
    doc = lxml.html.parse(infile)
    file_text = []

    for element in doc.iter():  # Iterate once through the entire document
        try:  # Grab tag name and text (+ tail text)
            tag = element.tag
            text = element.text
            tail = element.tail
        except:
            continue

        words = None  # text words split to list
        if tail:      # combine text and tail
            text = text + " " + tail if text else tail
        if text:      # lowercase and split to list
            words = text.lower().split()

        if tag in visible_text_tags:
            if words:
                file_text.append(' '.join(words))

    with open('example.txt', 'a') as myfile:
        myfile.write(' '.join(file_text).encode('utf8'))