I am using the following code to save a webpage using Python:
import urllib
import sys
from bs4 import BeautifulSoup
url = 'http://www.vodafone.de/privat/tarife/red-smartphone-tarife.html'
f = urllib.urlretrieve(url,'test.html')
Problem: This code saves only the basic HTML, without the JavaScript, images, etc. I want to save the webpage as complete (like the complete-webpage option we have in a browser).
Update:
I am now using the following code to save all the JS/image/CSS files of the webpage so that it can be saved as a complete webpage, but my output HTML is still getting saved as basic HTML:
import pycurl
import StringIO
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.vodafone.de/privat/tarife/red-smartphone-tarife.html")
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform()
html = b.getvalue()
#print html
fh = open("file.html", "w")
fh.write(html)
fh.close()
Try emulating your browser with Selenium. This script will pop up the Save As dialog for the webpage. You will still have to figure out how to emulate pressing Enter for the download to start, as the file dialog is out of Selenium's reach (how you do it is also OS-dependent).
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
br = webdriver.Firefox()
br.get('http://www.google.com/')
save_me = ActionChains(br).key_down(Keys.CONTROL)\
.key_down('s').key_up(Keys.CONTROL).key_up('s')
save_me.perform()
Also, I think following @Amber's suggestion of grabbing the linked resources may be simpler, and thus a better solution. Still, I think Selenium is a good starting point, as br.page_source will get you the entire DOM along with the dynamic content generated by JavaScript.
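For comparison, a minimal sketch of the br.page_source idea (it only captures the rendered HTML after JavaScript has run; the linked resource files would still have to be fetched separately):
from selenium import webdriver

br = webdriver.Firefox()
br.get('http://www.vodafone.de/privat/tarife/red-smartphone-tarife.html')
rendered = br.page_source  # full DOM, including JS-generated content
with open('rendered.html', 'wb') as fh:
    fh.write(rendered.encode('utf-8'))
br.quit()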
You can easily do that with the simple Python library pywebcopy.
For the current version, 5.0.1:
from pywebcopy import save_webpage
url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}
save_webpage(url, download_folder, **kwargs)
You will have the HTML, CSS and JS all in your download_folder, working just like the original site.
To get the script above by @rajatomar788 to run, I first had to install the following packages, which pywebcopy needs:
pip install pywebcopy
pip install pyquery
pip install w3lib
pip install parse
pip install lxml
After that it ran with a few errors, but I did get a folder filled with the files that make up the webpage:
webpage - INFO - Starting save_assets Action on url: 'http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html'
webpage - Level 100 - Queueing download of <89> asset files.
Exception in thread <Element(LinkTag, file:///++resource++images/favicon2.ico)>:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\threading.py", line 917, in _bootstrap_inner
self.run()
File "C:\ProgramData\Anaconda3\lib\threading.py", line 865, in run
self._target(*self._args, **self._kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 312, in run
super(LinkTag, self).run()
File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 58, in run
self.download_file()
File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 107, in download_file
req = SESSION.get(url, stream=True)
File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\configs.py", line 244, in get
return super(AccessAwareSession, self).get(url, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 546, in get
return self.request('GET', url, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 640, in send
adapter = self.get_adapter(url=request.url)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 731, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'file:///++resource++images/favicon2.ico'
webpage - INFO - Starting save_html Action on url: 'http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html'
Try the saveFullHtmlPage function below, or adapt it.
It saves a modified *.html and stores the JavaScript, CSS and images, based on the script, link and img tags (the tags_inner dict keys), in a _files folder.
import os, sys, re
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
def saveFullHtmlPage(url, pagepath='page', session=requests.Session(), html=None):
    """Save web page html and supported contents
    * pagepath : path-to-page
      It will create a file `'path-to-page'.html` and a folder `'path-to-page'_files`
    """
    def savenRename(soup, pagefolder, session, url, tag, inner):
        if not os.path.exists(pagefolder):  # create only once
            os.mkdir(pagefolder)
        for res in soup.findAll(tag):  # images, css, etc.
            if res.has_attr(inner):  # the inner attribute (file reference) must exist
                try:
                    filename, ext = os.path.splitext(os.path.basename(res[inner]))  # get name and extension
                    filename = re.sub(r'\W+', '', filename) + ext  # clean special chars from name
                    fileurl = urljoin(url, res.get(inner))
                    filepath = os.path.join(pagefolder, filename)
                    # rename the html ref so the html and its folder of files can be moved anywhere
                    res[inner] = os.path.join(os.path.basename(pagefolder), filename)
                    if not os.path.isfile(filepath):  # not downloaded yet
                        with open(filepath, 'wb') as file:
                            filebin = session.get(fileurl)
                            file.write(filebin.content)
                except Exception as exc:
                    print(exc, file=sys.stderr)

    if not html:
        html = session.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    path, _ = os.path.splitext(pagepath)
    pagefolder = path + '_files'  # page contents folder
    tags_inner = {'img': 'src', 'link': 'href', 'script': 'src'}  # tag & attribute pairs to grab
    for tag, inner in tags_inner.items():  # save resource files and rename refs
        savenRename(soup, pagefolder, session, url, tag, inner)
    with open(path + '.html', 'wb') as file:  # save the modified html doc
        file.write(soup.prettify('utf-8'))
Example: saving google.com as google.html, with its contents in a google_files folder (in the current folder):
saveFullHtmlPage('https://www.google.com', 'google')
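If the target site rejects the default requests User-Agent, you can pass your own session through the session parameter (the header value below is only illustrative):
import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})  # illustrative UA string
saveFullHtmlPage('https://www.google.com', 'google', session=session)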
Related
I am trying to gather all the images in a specific directory on my webserver, using BeautifulSoup 4.
So far I have this code:
from init import *
from bs4 import BeautifulSoup
import urllib
import urllib.request
# use this image scraper from the location that
#you want to save scraped images to
def make_soup(url):
    html = urllib.request.urlopen(url)
    return BeautifulSoup(html, features="html.parser")

def get_images(url):
    soup = make_soup(url)
    #this makes a list of bs4 element tags
    images = [img for img in soup.findAll('img')]
    print (str(len(images)) + "images found.")
    print ('Downloading images to current working directory.')
    #compile our unicode list of image links
    image_links = [each.get('src') for each in images]
    for each in image_links:
        filename=each.split('/')[-1]
        urllib.request.Request(each, filename)
    return image_links
#a standard call looks like this
get_images('https://omabilder.000webhostapp.com/img/')
This, however, spits out the following error:
7images found.
Downloading images to current working directory.
Traceback (most recent call last):
File "C:\Users\MyPC\Desktop\oma projekt\getpics.py", line 1, in <module>
from init import *
File "C:\Users\MyPC\Desktop\oma projekt\init.py", line 9, in <module>
from getpics import *
File "C:\Users\MyPC\Desktop\oma projekt\getpics.py", line 26, in <module>
get_images('https://omabilder.000webhostapp.com/img/')
File "C:\Users\MyPC\Desktop\oma projekt\getpics.py", line 22, in get_images
urllib.request.Request(each, filename)
File "C:\Users\MyPC\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 328, in __init__
self.full_url = url
File "C:\Users\MyPC\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 354, in full_url
self._parse()
File "C:\Users\MyPC\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 383, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: '/icons/blank.gif'
What I do not understand is the following:
There is no GIF in the directory and no /icons/ subdirectory.
Furthermore, it says 7 images were found, when only about 3 are uploaded to the website.
The GIFs are the icons next to the links on your website (tiny ~20x20 px images); they are actually shown on the website. If I understand correctly, you want to download the PNG images -- those are links, rather than images, at the URL you've provided.
If you want to download the PNG images from the links, you can use something like this:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import os
# use this image scraper from the location that
#you want to save scraped images to
def make_soup(url):
    html = urllib.request.urlopen(url)
    return BeautifulSoup(html, features="html.parser")

def get_images(url):
    soup = make_soup(url)
    # get all links (start with "a")
    images = [link["href"] for link in soup.find_all('a', href=True)]
    # keep ones that end with png
    images = [im for im in images if im.endswith(".png")]
    print (str(len(images)) + " images found.")
    print ('Downloading images to current working directory.')
    # download each image to the current working directory
    for each in images:
        urllib.request.urlretrieve(os.path.join(url, each), each)
    return images
# #a standard call looks like this
get_images('https://omabilder.000webhostapp.com/img/')
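One caveat with the loop above: os.path.join is meant for filesystem paths, so on Windows it can splice a backslash into the URL. A slightly safer variant (same variables, only the URL joining changed) would be:
from urllib.parse import urljoin

for each in images:
    urllib.request.urlretrieve(urljoin(url, each), each)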
I am trying to convert multiple HTML files into a PDF using pdfkit. This is my code:
from bs4 import BeautifulSoup
from selenium import webdriver
import pdfkit
import time

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/in/jaypratappandey/')
time.sleep(40)

soup = BeautifulSoup(driver.page_source, 'lxml')

data = []
f = open('htmlfile.html', 'w')
top = open('tophtmlfile.html', 'w')

for name in soup.select('.pv-top-card-section__body'):
    top.write("%s" % name)

for item in soup.select('.pv-oc.ember-view'):
    f.write("%s" % item)

pdfkit.from_file(['tophtmlfile.html', 'htmlfile.html'], 'jayprofile.pdf')

driver.quit()
This code gives the following error:
Traceback (most recent call last):
File "lkdndata.py", line 23, in <module>
pdfkit.from_file(['tophtmlfile.html', 'htmlfile.html'], 'ankurprofile.pdf')
File "/usr/local/lib/python3.5/dist-packages/pdfkit/api.py", line 49, in from_file
return r.to_pdf(output_path)
File "/usr/local/lib/python3.5/dist-packages/pdfkit/pdfkit.py", line 156, in to_pdf
raise IOError('wkhtmltopdf reported an error:\n' + stderr)
OSError: wkhtmltopdf reported an error:
Error: This version of wkhtmltopdf is build against an unpatched version of QT, and does not support more then one input document.
Exit with code 1, due to unknown error.
The solution I found was to first merge the HTML files into one and then convert that using pdfkit. So in your case, you would save tophtmlfile.html and htmlfile.html together in the same directory and point the path below at that directory.
import pdfkit
import os
# path to folder containing html files
path = "/home/ec2-user/data-science-processes/src/results/"
def multiple_html_to_pdf(path):
    """ converts multiple html files to a single pdf
    args: path to directory containing html files
    """
    empty_html = '<html><head></head><body></body></html>'
    for file in os.listdir(path):
        if file.endswith(".html"):
            print(file)
            # append html files
            with open(path + file, 'r') as f:
                html = f.read()
                empty_html = empty_html.replace('</body></html>', html + '</body></html>')

    # save merged html
    with open('merged.html', 'w') as f:
        f.write(empty_html)

    pdfkit.from_file('/home/ec2-user/data-science-processes/report/merged.html', 'Report.pdf')

multiple_html_to_pdf(path)
I had the same error. The error you are probably getting is due to an inconsistent Qt installation, i.e. no compatible (patched) Qt version is available.
Try running
wkhtmltopdf
in your terminal and see whether the output mentions "Reduced Functionality".
If yes, then my assumption is correct, and your safest bet would be to compile wkhtmltopdf from source.
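If you want to do that check from Python, here is a small sketch (it assumes wkhtmltopdf is on your PATH and simply searches its help output for the text mentioned above):
import subprocess

# Run wkhtmltopdf's help and look for the "Reduced Functionality" notice
# that the unpatched-Qt builds are said to print.
proc = subprocess.run(['wkhtmltopdf', '--help'], capture_output=True, text=True)
if 'Reduced Functionality' in (proc.stdout + proc.stderr):
    print('Unpatched-Qt build: multiple input documents are not supported.')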
I'm trying to dynamically download PDFs from a website. I am sure I'm listing them correctly, but I am not sure I'm doing the actual file I/O correctly. I get the following error:
File "download.py", line 22, in <module>
with open("'"+url+"'", "wb") as pdf:
IOError: [Errno 2] No such file or directory: "'http://www.lcs.mit.edu/publications/pubs/pdf/MIT-LCS-TR-179.pdf'"
Here is my code:
import requests
import re
from bs4 import BeautifulSoup
origin = requests.get("http://freehaven.net/anonbib")
soup=BeautifulSoup(origin.text)
results = soup.find_all(href=re.compile("(http).*(pdf)"))
for link in results:
    url = (link.get('href'))
    r = requests.get(url)
    with open("'"+url+"'", "wb") as pdf:
        try:
            pdf.write(r.content)
        finally:
            pdf.close
If url is set to 'http://www.lcs.mit.edu/publications/pubs/pdf/MIT-LCS-TR-179.pdf', your code fails because it is trying to open a file with that name on your filesystem.
Instead, try something like this:
fileForUrl = '/tmp/' + url.split('/')[-1]
with open(fileForUrl, 'wb') as pdf:
    # Rest of the code as before
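Putting it together, a sketch of the whole loop with the filename taken from the end of the URL (written to the current directory so it also works outside of /tmp):
import re
import requests
from bs4 import BeautifulSoup

origin = requests.get("http://freehaven.net/anonbib")
soup = BeautifulSoup(origin.text, "html.parser")
results = soup.find_all(href=re.compile("(http).*(pdf)"))

for link in results:
    url = link.get('href')
    r = requests.get(url)
    filename = url.split('/')[-1]  # e.g. MIT-LCS-TR-179.pdf
    with open(filename, 'wb') as pdf:
        pdf.write(r.content)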
I'm developing an application that runs on an Apache server with the Django framework. My current script works fine when it runs on the local desktop (without Django). The script downloads all the images from a website to a folder on the desktop. However, when I run the script on the server, Django just creates a file object that apparently has something in it (it should be Google's logo), but I can't open the file. I also create an HTML file with updated image link locations, and that file gets created fine, I'm assuming because it's all text. I believe I may have to use a file wrapper somewhere, but I'm not sure. Any help is appreciated; below is my code. Thanks!
from django.http import HttpResponse
from bs4 import BeautifulSoup as bsoup
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
import os
import sys
import zipfile
from django.core.servers.basehttp import FileWrapper
def getdata(request):
    out = 'C:\Users\user\Desktop\images'
    if request.GET.get('q'):
        #url = str(request.GET['q'])
        url = "http://google.com"
        soup = bsoup(urlopen(url))
        parsedURL = list(urlparse.urlparse(url))
        for image in soup.findAll("img"):
            print "Old Image Path: %(src)s" % image
            #Get file name
            filename = image["src"].split("/")[-1]
            #Get full path name if url has to be parsed
            parsedURL[2] = image["src"]
            image["src"] = '%s\%s' % (out,filename)
            print 'New Path: %s' % image["src"]
            # print image
            outpath = os.path.join(out, filename)
            #retrieve images
            if image["src"].lower().startswith("http"):
                urlretrieve(image["src"], outpath)
            else:
                urlretrieve(urlparse.urlunparse(parsedURL), out) #Constructs URL from tuple (parsedURL)
        #Create HTML file and write to it to check output (stored in same directory).
        html = soup.prettify("utf-8")
        with open("output.html", "wb") as file:
            file.write(html)
    else:
        url = 'You submitted nothing!'
    return HttpResponse(url)
My problem had to do with storing the files on the desktop. I stored the files in the Django project folder instead, changed the paths, and it worked for me.
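In other words, build the output path from a directory the Django/Apache process can actually write to, such as somewhere inside the project, instead of a hard-coded desktop path. A sketch (MEDIA_ROOT is just one reasonable choice; filename is the same variable as in the loop above):
import os
from django.conf import settings

# Save downloaded images under the project's media directory instead of the desktop.
out = os.path.join(settings.MEDIA_ROOT, 'images')
if not os.path.exists(out):
    os.makedirs(out)
outpath = os.path.join(out, filename)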