I ran the code below. Most of it works, but when I ran the "for elm in collect" block, I got an error: HTTPError: HTTP Error 403: Forbidden. Can anyone help with this? Thanks!
import requests
from bs4 import BeautifulSoup
import urllib.request
import os

resp = requests.get('https://www.williams.edu/institutional-research/common-data-set/',
                    headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(resp.text, 'html5lib')
links = [a['href'] for a in soup.select('li a[href]')]

collect = []
for link in links:
    if "https://www.williams.edu/institutional-research/files/" in link:
        collect.append(link)

for elm in collect:
    def main():
        download_file(elm)  # the elm is an url.

    def download_file(download_url):  # the download_url is the elm.
        save_path = 'C:/Users/WM'
        file_name = elm.split("/")[-1]
        complete_name = os.path.join(save_path, file_name)
        response = urllib.request.urlopen(download_url)
        file = open(complete_name, 'wb')
        file.write(response.read())
        file.close()
        print("Completed")

    if __name__ == "__main__":
        main()
I'm not sure why there is a mixed use of requests and urllib in your code. Just request the download_url in the loop the same way you request the initial URL, and add a header:
response = requests.get(download_url, headers={'User-Agent': 'Mozilla/5.0'})
Example
import requests
from bs4 import BeautifulSoup
import os

resp = requests.get('https://www.williams.edu/institutional-research/common-data-set/',
                    headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(resp.text, 'html5lib')
links = [a['href'] for a in soup.select('li a[href]')]

collect = []
for link in links:
    if "https://www.williams.edu/institutional-research/files/" in link:
        collect.append(link)

for elm in collect:
    def main():
        download_file(elm)  # the elm is an url.

    def download_file(download_url):  # the download_url is the elm.
        save_path = 'C:/Users/WM'
        file_name = elm.split("/")[-1]
        complete_name = os.path.join(save_path, file_name)
        response = requests.get(download_url, headers={'User-Agent': 'Mozilla/5.0'})
        file = open(complete_name, 'wb')
        file.write(response.content)  # a requests response exposes .content, not .read()
        file.close()
        print("Completed")

    if __name__ == "__main__":
        main()
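As a follow-up, here is a sketch of a more conventional layout: define the helper once and loop over the collect list built above. The save_path value comes from the question; the streaming write and the raise_for_status() call are assumptions added here, not part of the original answer.

import os
import requests

def download_file(download_url, save_path='C:/Users/WM'):
    # Name the local file after the last URL segment.
    file_name = download_url.split("/")[-1]
    complete_name = os.path.join(save_path, file_name)
    # Stream the response so large files are not held entirely in memory.
    with requests.get(download_url, headers={'User-Agent': 'Mozilla/5.0'}, stream=True) as response:
        response.raise_for_status()  # surface a 403/404 instead of silently writing an error page
        with open(complete_name, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    print("Completed", file_name)

for elm in collect:
    download_file(elm)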
Related
I am trying to download multiple .csv files from a URL directory with similar names (AB_daily.csv, BC_daily.csv, etc.). However, the files are stored in different folders within the directory. I know there is a way to use a loop to extract the files, but I can't figure out how to do it with Beautiful Soup or glob. Do you have any suggestions? I've also used pandas.read_csv() to look for shortcuts, as I'm just trying to concatenate the files together later. Thank you.
URL Directory: https://dd.weather.gc.ca/hydrometric/csv/
import requests
from bs4 import BeautifulSoup

def main(url):
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        target = [f"{url[:20]}{item['href']}" for item in soup.select(
            "a[href$='AB_daily_hydrometric.csv']")]
        for x in target:
            print(f"Downloading {x}")
            r = req.get(x)
            name = x.rsplit("/", 1)[-1]
            with open(name, 'wb') as f:
                f.write(r.content)

main('https://dd.weather.gc.ca/hydrometric/csv')
# For this specific task this will work:
import requests
from bs4 import BeautifulSoup

csv_links = []
links = ["AB/", "BC/", "MB/", "NB/", "NL/", "NS/", "NT/", "NU/", "ON/", "PE/", "QC/", "SK/", "YT/"]

def main(url):
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        for item in links:
            r_daily = BeautifulSoup(req.get(url + item + "daily/").content, 'html.parser')
            r_hourly = BeautifulSoup(req.get(url + item + "hourly/").content, 'html.parser')
            for item_d in r_daily.find_all('a'):
                if ".csv" in item_d.get('href'):
                    csv_links.append(url + item + "daily/" + item_d.get('href'))
            for item_h in r_hourly.find_all('a'):
                if ".csv" in item_h.get('href'):
                    csv_links.append(url + item + "hourly/" + item_h.get('href'))
        for x in csv_links:
            print(f"Downloading {x}")
            r = req.get(x)
            name = x.rsplit("/", 1)[-1]
            with open(name, 'wb') as f:
                f.write(r.content)

main('https://dd.weather.gc.ca/hydrometric/csv/')
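Since the stated goal is to concatenate the files afterwards, here is a minimal pandas sketch. It assumes the downloaded CSVs share a compatible column layout, and the glob pattern is illustrative rather than guaranteed to match the exact file names you saved.

import glob
import pandas as pd

# Read every daily CSV saved in the working directory and stack them into one frame.
frames = [pd.read_csv(path) for path in glob.glob("*_daily_hydrometric.csv")]
combined = pd.concat(frames, ignore_index=True)
combined.to_csv("all_daily_hydrometric.csv", index=False)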
This script uses Beautiful Soup to parse all PDF documents on a particular page of a website. The script successfully downloads one file but will not download all the files that are returned. I need help making it download all the PDF documents that I have parsed.
I have done research but have found no answers.
import requests
from bs4 import BeautifulSoup
import html5lib
import lxml

#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx?portalId=895956&pageId=1606144')
RFP_Import = ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
place_hoder = ('http://www.staffordmsd.org')

def get_pdf_links():
    r = requests.get(RFP_Import)
    soup = BeautifulSoup(r.content, 'html5lib')
    links = soup.find_all('a')
    pdf_links = [place_hoder + link['href'] for link in links if link['href'].endswith('pdf')]
    return pdf_links

def download_pdf_links(pdf_links):
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print("Downloading file:%s" % file_name)
        r = requests.get(link, stream=True)
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
        print('%s downloaded!\n' % file_name)
        print('all RFPs downloaded!')
        return

if __name__ == "__main__":
    pdf_links = get_pdf_links()
    download_pdf_links(pdf_links)
Successfully downloads first pdf document and then stops.
Inside download_pdf_links(), the return is misaligned. It should be aligned with the for. Otherwise it is part of the for loop, and the function returns after the first iteration.
This is probably also true for print('all RFPs downloaded!'). I guess you want that printed at the end of the for loop, after you have been through all the links.
In download_pdf_links you are using return inside your loop, which returns after the first iteration and stops downloading files. You need to return after the loop finishes by putting it at the same indentation level as the start of the loop, like this:
def download_pdf_links(pdf_links):
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print("Downloading file:%s" % file_name)
        r = requests.get(link, stream=True)
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
        print('%s downloaded!\n' % file_name)

    # Un-indented so it happens after the loop finishes.
    print('all RFPs downloaded!')
    return
My error:

File "C:/Users/hp dv4/PycharmProjects/project/imagescrap.py", line 22, in <module>
    imagefile.write(urllib.request.urlopen(img_src).read())
ValueError: unknown url type: '/img/logo_with_text.png'
I am getting this error while crawling the specified website, whereas the same code works fine with some other websites.
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

i = 1
soup = make_soup("http://ioe.edu.np/")
unique_srcs = []
for img in soup.findAll('img'):
    if img.get('src') not in unique_srcs:
        unique_srcs.append(img.get('src'))

for img_src in unique_srcs:
    filename = str(i)
    i = i + 1
    imagefile = open(filename + '.png', 'wb')
    imagefile.write(urllib.request.urlopen(img_src).read())
    imagefile.close()
The above code will run into one more error: it tries to save every file with a .png extension, which may make the files unreadable.
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

base_url = "http://ioe.edu.np/"
soup = make_soup(base_url)
unique_srcs = []
for img in soup.findAll('img'):
    if img.get('src') not in unique_srcs:
        unique_srcs.append(img.get('src'))

for i, img_src in enumerate(unique_srcs):
    print(img_src)
    filename = str(i)
    extension = img_src.split('.')[-1]
    with open(filename + '.' + extension, 'wb') as f:
        f.write(urllib.request.urlopen(base_url + img_src).read())
A few idiomatic Python suggestions:
use enumerate instead of trying to manage a counter yourself.
use the with-open construct, which takes care of closing your file.
One other thing you could do to further improve:
use a set instead of a list, so that you don't download the same file twice (see the sketch below).
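A minimal sketch combining those suggestions, assuming the same base_url as above:

import urllib.request
from bs4 import BeautifulSoup

base_url = "http://ioe.edu.np/"
soup = BeautifulSoup(urllib.request.urlopen(base_url), "html.parser")

# A set drops duplicates automatically, so each image URL is downloaded only once.
unique_srcs = {img.get('src') for img in soup.findAll('img') if img.get('src')}

for i, img_src in enumerate(unique_srcs):
    # Keep the original extension instead of forcing .png.
    extension = img_src.split('.')[-1]
    with open(str(i) + '.' + extension, 'wb') as f:  # with-open closes the file for us
        f.write(urllib.request.urlopen(base_url + img_src).read())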
As the error message says:
unknown url type: '/img/logo_with_text.png'
Prepend http://ioe.edu.np/ to img_src and it should work.
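One robust way to do that (a sketch, not the exact code from this answer) is urllib.parse.urljoin, which resolves relative paths against the base URL and leaves absolute URLs untouched:

from urllib.parse import urljoin

base_url = "http://ioe.edu.np/"
# Relative src values are resolved against base_url; absolute ones pass through unchanged.
print(urljoin(base_url, '/img/logo_with_text.png'))   # http://ioe.edu.np/img/logo_with_text.png
print(urljoin(base_url, 'http://example.com/a.png'))  # http://example.com/a.png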
I use this code to download jpg files without any problem. But as you can see from the following page source, the site includes a lot of images whose src points to blank.gif:
<a href="/en/chowchow-puppy-sleeping-dogs-pet-448311/"><img src="/static/img/blank.gif"
My question: is it possible to add a check so that when the src is blank.gif, the script automatically downloads the 640*426 image from "https://pixabay.com/en/chowchow-puppy-sleeping-dogs-pet-448311/" instead, and how can I achieve that?
import random
import requests
from bs4 import BeautifulSoup

# got from http://stackoverflow.com/a/16696317
def download_file(url):
    local_filename = url.split('/')[-1]
    print("Downloading {} ---> {}".format(url, local_filename))
    # NOTE the stream=True parameter
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    return local_filename

def Download_Image_from_Web(url):
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for link in soup.findAll('img'):
        image_links = link.get('src')
        if not image_links.startswith('http'):
            image_links = url + '/' + image_links
        download_file(image_links)

Download_Image_from_Web("https://pixabay.com/en/photos/?q=sleeping+puppy&hp=&image_type=&cat=&min_width=&min_height=")
Updated version. Read comments for additional info.
import random
import requests
from bs4 import BeautifulSoup

# got from http://stackoverflow.com/a/16696317
def download_file(url):
    local_filename = url.split('/')[-1]
    print("Downloading {} ---> {}".format(url, local_filename))
    # NOTE the stream=True parameter
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    return local_filename

def Download_Image_from_Web(url):
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for link in soup.findAll('img'):
        image_links = link.get('src')
        if image_links.endswith('blank.gif'):
            # Lazy-loaded images keep the real image URL in the data-lazy attribute.
            image_links = link.get('data-lazy')
        if not image_links.startswith('http'):
            image_links = url + '/' + image_links
        download_file(image_links)

Download_Image_from_Web("https://pixabay.com/en/photos/?q=sleeping+puppy&hp=&image_type=&cat=&min_width=&min_height=")
I wrote a simple Python scraper to grab some documents from a specific page on nytimes.com. It works in the sense that it grabs and formats all the URLs correctly, attempts to download the files, and formats the names correctly.
But all I get are 1 kB files. I can't figure out why. Here is my code:
import urllib2
import urllib
from cookielib import CookieJar

files = 'http://www.nytimes.com/interactive/2014/11/25/us/evidence-released-in-michael-brown-case.html?_r=0'
slashpos = 0

def getLinks(url):
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    p = opener.open(url)
    result = []
    for line in p:
        for element in line.split():
            if element.startswith('href="http://gr'):
                if element.endswith('pdf"') or element.endswith('png"') or element.endswith('jpg"'):
                    result.append(element[6:])
                else:
                    continue
    for char in result:
        slashpos = char.rfind('/') + 1
        urllib.urlretrieve(char, char[slashpos:-1])

getLinks(files)
Any and all help is appreciated. Thanks!
1) Use result.append(element[6:-1]) instead of result.append(element[6:]). This avoids having a trailing double quote in the URL, which is the reason the downloads fail.
2) For saving the file, use urllib.urlretrieve(char, char[slashpos:]) instead of urllib.urlretrieve(char, char[slashpos:-1]), so the last character of the file name is not cut off.
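For illustration, a sketch of those two fixes in place, with the rest of getLinks() unchanged from the question:

# inside the inner if-block:
result.append(element[6:-1])  # [6:-1] strips href=" and the trailing double quote

# the download loop:
for char in result:
    slashpos = char.rfind('/') + 1
    urllib.urlretrieve(char, char[slashpos:])  # keep the full file name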
Solved !! :D
#!/usr/bin/env python
from bs4 import BeautifulSoup
import urllib2
import urlparse
from sys import argv
from cookielib import CookieJar

if len(argv) != 2:
    print "Usage:\n\tpython %s 'http://www.nytimes.com/interactive/2014/11/25/us/evidence-released-in-michael-brown-case.html?_r=0'" % argv[0]
    exit()

url = argv[1]
urls = []

try:
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    html = opener.open(url)
except:
    print "[-] No such website"
    exit()

soup = BeautifulSoup(html)

for tag in soup.find_all('a'):
    try:
        tag["href"] = urlparse.urljoin(url, tag['href'])
        if tag['href'] not in urls and ('.png' in tag['href'] or '.jpg' in tag['href']):
            newpdf = tag['href'].split("/")
            name = newpdf[-1]
            resp = urllib2.urlopen(tag['href'])
            meta_data = resp.info()
            fsize = int(meta_data.getheaders("Content-Length")[0])
            print "Downloading --> %s \t size: %s " % (name, fsize)
            f = open(name, "wb")
            f.write(resp.read())
            f.close()
            urls.append(tag["href"])
        else:
            print tag['href']
    except KeyboardInterrupt:
        print " User hit CTRL+C"
        exit()
    except:
        pass
Hopefully this is helpful for you.