This script uses Beautiful Soup to parse all of the PDF document links on a particular page of a website. The script successfully downloads one file but will not download all of the files that are returned. I need help making it download every PDF document that I have parsed.
I have done research but have found no answers.
import requests
from bs4 import BeautifulSoup
import html5lib
import lxml

#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx?portalId=895956&pageId=1606144')
RFP_Import = ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
place_hoder = ('http://www.staffordmsd.org')

def get_pdf_links():
    r = requests.get(RFP_Import)
    soup = BeautifulSoup(r.content, 'html5lib')
    links = soup.find_all('a')
    pdf_links = [place_hoder + link['href'] for link in links if link['href'].endswith('pdf')]
    return pdf_links

def download_pdf_links(pdf_links):
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print("Downloading file:%s" % file_name)
        r = requests.get(link, stream=True)
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
        print('%s downloaded!\n' % file_name)
        print('all RFPs downloaded!')
        return

if __name__ == "__main__":
    pdf_links = get_pdf_links()
    download_pdf_links(pdf_links)
It successfully downloads the first PDF document and then stops.
Inside download_pdf_links(), return is misaligned. It should be aligned with the for. Otherwise it is part of the for loop, and the function terminates after the first iteration.
This is probably also true for print('all RFPs downloaded!'). I guess you want that printed at the end of the for loop, after you have been through all of the links.
In download_pdf_links() you are using return inside your loop, which will return after the first iteration of the loop and stop downloading files. You need to return after the loop finishes, by putting it at the same indentation as the start of the loop, like this:
def download_pdf_links(pdf_links):
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print("Downloading file:%s" % file_name)
        r = requests.get(link, stream=True)
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
        print('%s downloaded!\n' % file_name)

    # Un-indented so it happens after the loop finishes.
    print('all RFPs downloaded!')
    return
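Beyond the indentation fix, here is a minimal hardened sketch of get_pdf_links() (my own suggestion, not part of the original answers; the renamed constants are just for readability) that skips anchors without an href and resolves relative links with urljoin, so a missing attribute does not raise a KeyError:

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.staffordmsd.org'
RFP_URL = 'http://www.staffordmsd.org/departments/business_operations/bids_and_proposals'

def get_pdf_links():
    r = requests.get(RFP_URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    pdf_links = []
    for link in soup.find_all('a', href=True):        # only anchors that actually have an href
        href = link['href']
        if href.lower().endswith('.pdf'):
            pdf_links.append(urljoin(BASE_URL, href))  # works for relative and absolute hrefs
    return pdf_links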
Related
I am trying to download multiple .csv files with similar names (AB_daily.csv, BC_daily.csv, etc.) from a URL directory. However, each file is stored in a different folder in the directory. I know there is a way to use a loop to extract the files, but I can't figure out how to do it with Beautiful Soup or glob. Do you have any suggestions? I've also used pandas.read_csv() to look for shortcuts, as I'm just trying to concatenate the files together later. Thank you.
URL Directory: https://dd.weather.gc.ca/hydrometric/csv/
import os
import requests
import urllib.request
from bs4 import BeautifulSoup

def main(url):
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        target = [f"{url[:20]}{item['href']}" for item in soup.select(
            "a[href$='AB_daily_hydrometric.csv']")]
        for x in target:
            print(f"Downloading {x}")
            r = req.get(x)
            name = x.rsplit("/", 1)[-1]
            with open(name, 'wb') as f:
                f.write(r.content)

main('https://dd.weather.gc.ca/hydrometric/csv')
# For this specific task this will work:
import os
import requests
import urllib.request
from bs4 import BeautifulSoup

csv_links = []
links = ["AB/", "BC/", "MB/", "NB/", "NL/", "NS/", "NT/", "NU/", "ON/", "PE/", "QC/", "SK/", "YT/"]

def main(url):
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        for item in links:
            r_daily = BeautifulSoup(req.get(url + item + "daily/").content, 'html.parser')
            r_hourly = BeautifulSoup(req.get(url + item + "hourly/").content, 'html.parser')
            for item_d in r_daily.find_all('a'):
                if ".csv" in item_d.get('href'):
                    csv_links.append(url + item + "daily/" + item_d.get('href'))
            for item_h in r_hourly.find_all('a'):
                if ".csv" in item_h.get('href'):
                    csv_links.append(url + item + "hourly/" + item_h.get('href'))
        for x in csv_links:
            print(f"Downloading {x}")
            r = req.get(x)
            name = x.rsplit("/", 1)[-1]
            with open(name, 'wb') as f:
                f.write(r.content)

main('https://dd.weather.gc.ca/hydrometric/csv/')
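The question also mentions concatenating the files with pandas afterwards; a minimal sketch of that step (the glob pattern and output file name here are my own assumptions, not from the original post):

import glob
import pandas as pd

# Stack every CSV that was downloaded into the working directory into one frame.
frames = [pd.read_csv(path) for path in glob.glob("*.csv")]
combined = pd.concat(frames, ignore_index=True)
combined.to_csv("all_data_combined.csv", index=False)  # exclude this file from the glob if re-running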
I have some code that allows for the downloading of various comics off of xkcd. This code is gathered from Al Sweigart's book: Automate The Boring Stuff With Python with some minor edits made by me.
I understand most of what is going on. What's confusing is that the 'soup' BeautifulSoup object, which is made from a request named 'r', continues to provide information from the page that can be used throughout the code, even though 'r' is re-instantiated in the function 'download_image()'.
Even more confusing is that if the 'r' found in 'download_image()' is renamed to something other than 'r', the code will break.
Code:
import requests
import os
import bs4

os.makedirs('xkcd', exist_ok=True)
page = input('What issue of xkcd would you like to download? (*all for all comics, *today for today\'s comic): ')
url = 'http://xkcd.com/'

def download_image():
    comic_url = 'http:' + comic[0].get('src')  # page with just the image
    r = requests.get(comic_url)  # switches to that page
    # gets file with directory xkcd/name of comic
    try:
        issue_number = str(int(str(soup.select('a[rel="prev"]')[0].get('href'))[1:-1]) + 1)
    except ValueError:
        issue_number = '1'
    name = os.path.basename(comic_url[:-4] + "_" + issue_number + ".png")
    file = open(os.path.join('xkcd', name), 'wb')
    print("Downloading image %s... " % name)
    # writes to file
    for chunk in r.iter_content(100000):
        file.write(chunk)
    file.close()

if page == '*all':
    url = 'http://xkcd.com/5'
    while not url.endswith('#'):
        r = requests.get(url)
        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        comic = soup.select('#comic img')
        download_image()
        prev_link = soup.select('a[rel="prev"]')[0]
        url = 'http://xkcd.com/' + prev_link.get('href')
else:
    if page == '*today':
        page = ''
    r = requests.get(url + page)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    comic = soup.select('#comic img')
    if not comic:
        print("Comic not found.")
    else:
        download_image()

"""
r = requests.get('https://imgs.xkcd.com/comics/python.png')
# makes file and write the file in bytes to it
with open('comic.png', 'wb') as f:
    f.write(r.content)
"""
Does anyone know why the soup variable continues to work after re-defining the r variable?
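For illustration only (my addition, not part of the original post), a minimal standalone sketch of the scoping behaviour being asked about: once soup is built it holds its own parsed tree, so rebinding r inside a function does not affect it:

import bs4

r = "<p>original page</p>"                   # stands in for the module-level response text
soup = bs4.BeautifulSoup(r, 'html.parser')   # soup parses the data; it keeps no link to the name r

def download_image():
    r = "something else entirely"            # assignment makes this r local to the function
    print(soup.p.text)                       # the module-level soup is unaffected and still usable

download_image()                             # prints: original page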
My current code is cutting the first 6 characters from file names while downloading PDFs. For example, the PDF file name is 123456acII.pdf (https://example.com/wp-content/uploads/2016/11/123456acII.pdf), but the file in the folder is acII.pdf.
How can I make the names stay as they are?
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

main = "https://example.com/"

# If there is no such folder, the script will create one automatically
folder_location = r'C:\temp\webscraping'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

def Get_Links():
    r = requests.get(main).text
    soup = BeautifulSoup(r, 'html.parser')
    links = []
    for item in soup.findAll("div", {'class': 'large-4 medium-4 columns'}):
        for n in item.find_all('a'):
            print('Link: ' + n.get('href'))
            links.append(n.get('href'))
    return links

def Parse_Links():
    pdf = set()
    for url in Get_Links():
        r = requests.get(url).text
        soup = BeautifulSoup(r, 'html.parser')
        for item in soup.findAll("div", {'class': 'large-6 medium-8 columns large-centered'}):
            for link in item.findAll("a"):
                link = link.get("href")
                if link:
                    pdf.add(link)
    return pdf

def Save():
    for item in Parse_Links():
        print(f"Downloading File: {item[55:]}")
        filename = os.path.join(folder_location, f"{item[55:]}")
        r = requests.get(item)
        with open(filename, 'wb') as f:
            f.write(r.content)
    print("done")

Save()
It looks like you are slicing the string starting at index position 55 ({item[55:]}). Try simply starting your slice 6 positions earlier:
change it to: {item[49:]}
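As an alternative to hard-coding a slice offset (my suggestion, not part of the original answer), the file name can be taken from the URL path itself, which keeps the name intact no matter how long the URL is:

import os
from urllib.parse import urlsplit

def filename_from_url(url):
    # Last path component of the URL, e.g. '123456acII.pdf'
    return os.path.basename(urlsplit(url).path)

print(filename_from_url("https://example.com/wp-content/uploads/2016/11/123456acII.pdf"))
# -> 123456acII.pdf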
My error:
File "C:/Users/hp dv4/PycharmProjects/project/imagescrap.py", line
22, in
imagefile.write(urllib.request.urlopen(img_src).read())
ValueError: unknown url type: '/img/logo_with_text.png'
I am getting this error while crawling the specified website, whereas the same code works fine with some other websites.
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

i = 1
soup = make_soup("http://ioe.edu.np/")
unique_srcs = []
for img in soup.findAll('img'):
    if img.get('src') not in unique_srcs:
        unique_srcs.append(img.get('src'))

for img_src in unique_srcs:
    filename = str(i)
    i = i + 1
    imagefile = open(filename + '.png', 'wb')
    imagefile.write(urllib.request.urlopen(img_src).read())
    imagefile.close()
The above code will encounter one more error: you are trying to save every file with a .png extension, which may make the files unreadable.
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

base_url = "http://ioe.edu.np/"
soup = make_soup(base_url)

unique_srcs = []
for img in soup.findAll('img'):
    if img.get('src') not in unique_srcs:
        unique_srcs.append(img.get('src'))

for i, img_src in enumerate(unique_srcs):
    print(img_src)
    filename = str(i)
    extension = img_src.split('.')[-1]
    with open(filename + '.' + extension, 'wb') as f:
        f.write(urllib.request.urlopen(base_url + img_src).read())
A few idiomatic Python suggestions:
Use enumerate instead of trying to manage a counter yourself.
Use the with open(...) construct, which takes care of closing your file.
One other thing you could do to further improve: use a set instead of a list, so that you don't download the same file twice.
As the error message says:
unknown url type: '/img/logo_with_text.png'
Add http://ioe.edu.np/ in front of img_src and it should work.
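A more general way to handle relative paths like this (my suggestion, not from the original answers) is to resolve each src against the page URL with urljoin, which copes with both relative and absolute src values:

import urllib.request
from urllib.parse import urljoin

base_url = "http://ioe.edu.np/"
img_src = "/img/logo_with_text.png"

full_url = urljoin(base_url, img_src)   # -> http://ioe.edu.np/img/logo_with_text.png
data = urllib.request.urlopen(full_url).read()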
url="someurl"
outputfile='./file.zip'
link=urllib.urlopen(url)
soup= bs4.BeautifulSoup(link,'lxml')
links=[]
for data in soup.find_all('div', class_='master_content-outer-container'):
for a in data.find_all('a'):
links.append(a.get('href'))
output = open(outputfile, "wb")
for i in links:
request=urllib.urlopen(i)
read=request.read()
output.write(read)
output.close()
zip_ref= zipfile.ZipFile(outputfile,'r')
zip_ref.extractall('./data/')
zip_ref.close()
I have URLs stored in a list and I am supplying them to urllib. Each URL ends with a .zip extension. When I run this code I get only the last file downloaded from the list. There are more than 400 links to be downloaded.
Am I missing something?
So you are writing all your files into one; that's not going to work.
Try this:
import os
import urllib
import zipfile

import bs4

url = "someurl"
outputfile = './file.zip'
link = urllib.urlopen(url)
soup = bs4.BeautifulSoup(link, 'lxml')

links = []
for data in soup.find_all('div', class_='master_content-outer-container'):
    for a in data.find_all('a'):
        links.append(a.get('href'))

for i in links:
    request = urllib.urlopen(i)
    read = request.read()
    file_name = os.path.basename(i)
    output = open(file_name, "wb")
    output.write(read)
    output.close()
    zip_ref = zipfile.ZipFile(file_name, 'r')
    zip_ref.extractall('./data/')
    zip_ref.close()
Option 2
import os
import urllib
import zipfile

import bs4

url = "someurl"
outputfile = './file.zip'
link = urllib.urlopen(url)
soup = bs4.BeautifulSoup(link, 'lxml')

def download_and_extract(link):
    request = urllib.urlopen(link)
    read = request.read()
    file_name = os.path.basename(link)
    output = open(file_name, "wb")
    output.write(read)
    output.close()
    zip_ref = zipfile.ZipFile(file_name, 'r')
    zip_ref.extractall('./data/')
    zip_ref.close()

for data in soup.find_all('div', class_='master_content-outer-container'):
    for a in data.find_all('a'):
        download_and_extract(a.get('href'))
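For what it's worth, the snippets above use the Python 2 urllib API (urllib.urlopen). A minimal Python 3 equivalent of the per-link helper, following the same one-file-per-link approach, might look like this (my sketch, not part of the original answers):

import os
import zipfile
import urllib.request

def download_and_extract(link):
    file_name = os.path.basename(link)
    # Save each zip under its own name, then unpack it into ./data/.
    with urllib.request.urlopen(link) as resp, open(file_name, "wb") as output:
        output.write(resp.read())
    with zipfile.ZipFile(file_name, 'r') as zip_ref:
        zip_ref.extractall('./data/')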