Twilio Python Module Errors After Compiling

I have written a simple program that opens a CSV file and texts all the numbers in it. I am using Twilio (twilio-python) as a service provider. My code works fine as a Python script. However, when I compile the script (using py2exe), the exe errors out. This is the error I receive from the log file:
Traceback (most recent call last):
File "sms.py", line 39, in <module>
File "twilio\rest\resources\messages.pyc", line 112, in create
File "twilio\rest\resources\base.pyc", line 352, in create_instance
File "twilio\rest\resources\base.pyc", line 204, in request
File "twilio\rest\resources\base.pyc", line 129, in make_twilio_request
File "twilio\rest\resources\base.pyc", line 101, in make_request
File "httplib2\__init__.pyc", line 1570, in request
File "httplib2\__init__.pyc", line 1317, in _request
File "httplib2\__init__.pyc", line 1252, in _conn_request
File "httplib2\__init__.pyc", line 1021, in connect
File "httplib2\__init__.pyc", line 80, in _ssl_wrap_socket
File "ssl.pyc", line 387, in wrap_socket
File "ssl.pyc", line 141, in __init__
ssl.SSLError: [Errno 185090050] _ssl.c:340: error:0B084002:x509 certificate
routines:X509_load_cert_crl_file:system lib
I don't receive this error when I run the uncompiled code (below):
import sys   # 2 params --- /path/to/contact/file --- up to 160 char msg
import csv
import time
from twilio.rest import TwilioRestClient

ACCOUNT_SID = "**************************"
AUTH_TOKEN = "**************************"
client = TwilioRestClient(ACCOUNT_SID, AUTH_TOKEN)

sys.argv.pop(0)
contactFile = sys.argv[0]
sys.argv.pop(0)
msg = (' ').join(sys.argv)

print contactFile
print " "
print msg

info = []
with open(contactFile, 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in reader:
        info.append(row)

contactCount = len(info) - 1
if contactCount > 0:
    # remove first item from the list because it's not a needed value (the header row)
    info.pop(0)
    for i in info:
        print " "
        contactName = i[0]
        phoneNumber = i[1]
        print "Texting " + contactName + "... \n"
        client.messages.create(
            to=phoneNumber,
            from_="+14782856136",
            body=msg
        )
        time.sleep(1.5)
else:
    print("SMSify Error \n The contact file doesn't have any contacts in it.")
Any thoughts on what is going on??
EDIT:
Here is my setup.py file:
from distutils.core import setup
import py2exe, sys, os

sys.argv.append('py2exe')

Mydata_files = [('cacert.pem', ['C:\\Python27\\Lib\\site-packages\\twilio\\conf\\cacert.pem'])]

setup(
    console=['sms.py'],
    data_files=Mydata_files,
    options={
        "py2exe": {
            "bundle_files": 1,
            "compressed": True
        }
    }
)

This happens because the CA certificate file is missing from the bundle.
The same problem occurs with the requests and httplib2 modules.
For example, if you have a file named req_example.py that uses the requests module:
import requests
url = 'https://google.com/'
requests.get(url)
It works when you run it as python req_example.py, but once bundled it stops working.
Or if you have a file named http2_example.py that uses the httplib2 module:
import httplib2
url = 'https://google.com/'
http = httplib2.Http()
http.request(url)
It works when you run it as python http2_example.py, but once bundled it stops working.
To fix that, you have two options, one bad and one good.
Disable SSL certificate verification:
To do that for the requests module:
import requests
url = 'https://google.com/'
requests.get(url, verify=False)
And for the httplib2 module:
import httplib2

url = 'https://google.com/'
http = httplib2.Http(disable_ssl_certificate_validation=True)
http.request(url)
Add the CA certificate file to the bundle:
For the requests module, the file cacert.pem is located in:
.../PythonXX/lib/site-packages/requests/cacert.pem
And for the httplib2 module it is in:
.../PythonXX/lib/site-packages/httplib2/cacerts.txt
For each of them, you can copy the file into your project (or just point at its existing path), and configure setup.py to include it:
setup(console=['temp.py'],
      # for the `requests` module
      data_files=['cacert.pem'])  # or data_files=['cacerts.txt'] for `httplib2`
Then change your code to use that file. For the requests module:
import os
import requests

url = 'https://google.com/'
cert = 'cacert.pem'
os.environ['REQUESTS_CA_BUNDLE'] = os.path.join(os.getcwd(), cert)
# or simply: os.environ['REQUESTS_CA_BUNDLE'] = cert
requests.get(url)
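A small variation, not from the original answer: requests also accepts the bundle path per call through its verify parameter, which avoids touching the environment:
import os
import requests

url = 'https://google.com/'
# verify= accepts a path to a CA bundle file instead of the environment variable
requests.get(url, verify=os.path.join(os.getcwd(), 'cacert.pem'))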
And for the httplib2 module:
import httplib2

url = 'https://google.com/'
cert = 'cacerts.txt'
http = httplib2.Http(ca_certs=cert)
http.request(url)
Or, if your httplib2 version is 0.8, you can create a file named ca_certs_locater.py that defines a get function returning the path of the ca_certs file:
def get():
    return 'cacerts.txt'
OK, now for your error: the twilio module uses httplib2, and its cacert.pem is in:
.../twilio/conf/cacert.pem
So you need to add this file to setup.py as described above.
But twilio itself has a function named get_cert_file that passes the ca_cert file to httplib2. I think the ca_certs_locater.py approach described above will also work for that, but if not, you still have an ugly option: monkey-patch twilio's get_cert_file. Note that rebinding a name pulled in with from ... import does not affect the module, so patch the attribute on the module itself:
import twilio.rest.resources.base
twilio.rest.resources.base.get_cert_file = lambda: 'cacert.pem'
Note that this may be worth filing as an issue against twilio, or even py2exe or PyInstaller.

I had the same problem with twilio and PyInstaller, and was able to fix it by modifying the base.py module in twilio\rest\resources:
def get_cert_file():
    """ Get the cert file location or bail """
    # XXX - this currently fails test coverage because we don't actually go
    # over the network anywhere. Might be good to have a test that stands up a
    # local server and authenticates against it.
    try:
        # Apparently __file__ is not available in all places so wrapping this
        # in a try/catch
        current_path = os.path.realpath(__file__)
        # old path:
        # ca_cert_path = os.path.join(current_path, "..", "..", "..",
        #                             "conf", "cacert.pem")
        # my new path:
        ca_cert_path = os.path.join(os.getcwd(), 'Config', 'cacert.pem')
        return os.path.abspath(ca_cert_path)
(I am storing my cacert.pem file in a Config folder off my main script directory.)

There should probably be a way for py2exe to bundle non-Python files, like templates or, in this case, the SSL certificate stored in cacert.pem. Normally this is done automatically with a MANIFEST.in, but I am not sure how that project handles it. Check the py2exe documentation for more information.
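On that note, here is a sketch of one way to locate such bundled files at runtime, assuming py2exe's usual layout where data_files end up next to the generated exe (the helper name is mine, not from the answers above):
import os
import sys

def resource_path(name):
    # py2exe sets sys.frozen on the bundled interpreter; data_files are
    # placed alongside the executable, so resolve relative to it there
    if getattr(sys, 'frozen', False):
        base = os.path.dirname(sys.executable)
    else:
        base = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(base, name)

# e.g. point the monkey-patched get_cert_file at the bundled certificate
cacert_path = resource_path('cacert.pem')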

Related

Cannot access file on Samba server via Python

I'm trying to access a file on our Samba server using Python. I found out I need to use a Samba client for this, so I started using PySmbClient. Even though there are many examples online of how to do this, mine just does not want to work. See below.
smb = smbclient.SambaClient(server="192.168.0.320", share="DATA", domain="WORKGROUP",username="admin", password="abc123")
f = smb.open('test.json', 'r')
This produces the following error:
OSError: [Errno 2] No such file or directory
with the following trace:
Traceback (most recent call last):
File "create_dataset.py", line 35, in <module>
f = smb.open('serverSaver.txt', 'r')
File "/home/grant/Development/create_dataset/env/local/lib/python2.7/site-packages/smbclient.py", line 408, in open
f = _SambaFile(self, path, mode)
File "/home/grant/Development/create_dataset/env/local/lib/python2.7/site-packages/smbclient.py", line 448, in __init__
connection.download(remote_name, self._tmp_name)
File "/home/grant/Development/create_dataset/env/local/lib/python2.7/site-packages/smbclient.py", line 393, in download
result = self._runcmd('get', remote_path, local_path)
File "/home/grant/Development/create_dataset/env/local/lib/python2.7/site-packages/smbclient.py", line 184, in _runcmd
return self._raw_runcmd(fullcmd)
File "/home/grant/Development/create_dataset/env/local/lib/python2.7/site-packages/smbclient.py", line 168, in _raw_runcmd
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
File "/usr/lib/python2.7/subprocess.py", line 711, in __init__
errread, errwrite)
File "/usr/lib/python2.7/subprocess.py", line 1343, in _execute_child
raise child_exception
I've read and implemented many "solutions", but so far nothing has worked for me. I can access the Samba server with the given credentials through my file manager just fine, so I know those values should be fine. I even spoke to our sys admin and he doesn't know what could be wrong.
There must be more to it than the simple code I wrote. Do you think there's an issue on the server side, or with the values I'm passing to SambaClient? At this point I'm pretty much open to anything that leads to a solution.
Here's some code that works for me, transferring a file from a Linux Samba share to my Windows laptop. It's also known to work fine in the other direction (Linux client, Windows server).
I'm using the pysmb library version 1.1.19 (the latest) and Python 2.7.1.
See the pysmb site for the pysmb package; I actually downloaded and installed it directly from its tarball and setup.py, as pip was throwing an error.
The pysmb package is less user-friendly but it does work well for Windows clients.
I set up a share called "my_share" on the Linux machine for user "edwards" using the following entry in smb.conf:
[my_share]
path = /home/edwards
valid_users = edwards
read only = no
guest ok = yes
browseable = yes
And then used the following code to list the files on the share, and download a file called "rti_license.dat" to my laptop:
import tempfile
import shutil
from smb.SMBConnection import SMBConnection

share_name = "my_share"
user_name = "edwards"
password = "######"  # secret :-)
local_machine_name = "laptop"  # arbitrary
server_machine_name = "edwards-Yocto"  # MUST match correctly
server_IP = "192.162.2.1"  # as must this

# create and establish connection
conn = SMBConnection(user_name, password, local_machine_name,
                     server_machine_name, use_ntlm_v2=True)
assert conn.connect(server_IP, 139)

# print list of files at the root of the share
files = conn.listPath(share_name, "/")
for item in files:
    print item.filename

# check if the file we want is there
sf = conn.getAttributes(share_name, "rti_license.dat")
print sf.file_size
print sf.filename

# create a temporary file for the transfer, in binary mode so the
# contents are not mangled on Windows
file_obj = tempfile.NamedTemporaryFile(mode='w+b', delete=False)
file_name = file_obj.name
file_attributes, copysize = conn.retrieveFile(share_name, "rti_license.dat", file_obj)
print copysize
file_obj.close()

# copy temporary file
shutil.copy(file_name, "rti_license.dat")

# close connection
conn.close()
Note that the server name must be correct or the connection won't work (from a Linux machine it's the output of the hostname command)
Hope this may be useful.
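As a side note (my sketch, not part of the answer above): if you only need the file's bytes in memory, retrieveFile accepts any writable file object, so an io.BytesIO avoids the temporary-file-and-copy dance. Reusing the connection settings from the script above:
import io
from smb.SMBConnection import SMBConnection

conn = SMBConnection(user_name, password, local_machine_name,
                     server_machine_name, use_ntlm_v2=True)
assert conn.connect(server_IP, 139)

buf = io.BytesIO()
# retrieveFile writes the remote file into any object with a write() method
file_attributes, filesize = conn.retrieveFile(share_name, "rti_license.dat", buf)
contents = buf.getvalue()  # the raw bytes of the remote file
conn.close()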

List Google Cloud Storage buckets using Python

I'm following this tutorial: https://developers.google.com/storage/docs/gspythonlibrary and getting a couple of errors while trying to list my buckets.
I've downloaded gsutil and added it to my PYTHONPATH which looks like this:
/home/nicolasalvo/tools/gsutil/third_party/boto:/home/nicolasalvo/tools/gsutil
I've also executed:
pip install -U oauth2client
The code I'm trying to run is:
import StringIO
import os
import shutil
import tempfile
import time

from gslib.third_party.oauth2_plugin import oauth2_plugin
import boto

# URI scheme for Google Cloud Storage.
GOOGLE_STORAGE = 'gs'
# URI scheme for accessing local files.
LOCAL_FILE = 'file'

uri = boto.storage_uri('', GOOGLE_STORAGE)
for bucket in uri.get_all_buckets():
    print bucket.name
The first error I got is:
File "<stdin>", line 1, in <module>
File "/home/nicolasalvo/tools/gsutil/gslib/third_party/oauth2_plugin/oauth2_plugin.py", line 3, in <module>
import oauth2_client
File "/home/nicolasalvo/tools/gsutil/gslib/third_party/oauth2_plugin/oauth2_client.py", line 33, in <module>
import socks
I fixed this manually by changing:
import socks
to be
import httplib2.socks
Now the error I'm facing is:
File "/home/nicolasalvo/tools/gsutil/third_party/boto/boto/connection.py", line 876, in _mexe
request.authorize(connection=self)
File "/home/nicolasalvo/tools/gsutil/third_party/boto/boto/connection.py", line 377, in authorize
connection._auth_handler.add_auth(self, **kwargs)
File "/home/nicolasalvo/tools/gsutil/gslib/third_party/oauth2_plugin/oauth2_plugin.py", line 22, in add_auth
self.oauth2_client.GetAuthorizationHeader()
File "/home/nicolasalvo/tools/gsutil/gslib/third_party/oauth2_plugin/oauth2_client.py", line 325, in GetAuthorizationHeader
return 'Bearer %s' % self.GetAccessToken().token
File "/home/nicolasalvo/tools/gsutil/gslib/third_party/oauth2_plugin/oauth2_client.py", line 288, in GetAccessToken
token_exchange_lock.acquire()
NameError: global name 'token_exchange_lock' is not defined
I've tried declaring the global object before it is used, but that hasn't helped.
I would also expect the Google-provided libraries for Cloud Storage to work without manual fixes.
I'm running Python 2.7.3
Any help is greatly appreciated.
This worked for me. Define the missing lock on the oauth2_client module before it is used:
import threading
from gslib.third_party.oauth2_plugin import oauth2_client
oauth2_client.token_exchange_lock = threading.Lock()
Here is the full script:
import StringIO
import os
import shutil
import tempfile
import time
import threading

import boto
from gslib.third_party.oauth2_plugin import oauth2_plugin
from gslib.third_party.oauth2_plugin import oauth2_client

oauth2_client.token_exchange_lock = threading.Lock()

# URI scheme for Google Cloud Storage.
GOOGLE_STORAGE = 'gs'
# URI scheme for accessing local files.
LOCAL_FILE = 'file'

project_id = 'abc.com:abc'

uri = boto.storage_uri('', GOOGLE_STORAGE)
header_values = {"x-goog-project-id": project_id}
for bucket in uri.get_all_buckets(headers=header_values):
    print bucket.name
Yes it works!!
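For what it's worth (an aside, not part of the original answer): the newer google-cloud-storage client library handles the OAuth plumbing itself, so the same listing needs no manual locks. A minimal sketch, assuming pip install google-cloud-storage and default application credentials are set up:
from google.cloud import storage

client = storage.Client(project='abc.com:abc')
for bucket in client.list_buckets():
    print(bucket.name)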

Read a file in buffer from FTP python

I am trying to read a file from an FTP server. The file is a .gz file. I would like to know if I can perform actions on this file while the socket is open. I tried to follow what was mentioned in two StackOverflow questions on reading files without writing to disk and reading files from FTP without downloading but was not successful.
I know how to extract data/work on the downloaded file but I'm not sure if I can do it on the fly. Is there a way to connect to the site, get data in a buffer, possibly do some data extraction and exit?
When trying StringIO, I got this error:
>>> from ftplib import FTP
>>> from StringIO import StringIO
>>> ftp = FTP('ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/PMC-ids.csv.gz')
Traceback (most recent call last):
File "<pyshell#2>", line 1, in <module>
ftp = FTP('ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/PMC-ids.csv.gz')
File "C:\Python27\lib\ftplib.py", line 117, in __init__
self.connect(host)
File "C:\Python27\lib\ftplib.py", line 132, in connect
self.sock = socket.create_connection((self.host, self.port), self.timeout)
File "C:\Python27\lib\socket.py", line 553, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
gaierror: [Errno 11004] getaddrinfo failed
I just need to know how can I get data into some variable and loop on it until the file from FTP is read.
I appreciate your time and help. Thanks!
Make sure to log in to the FTP server first. Also note that ftplib's FTP() takes a bare hostname such as ftp.ncbi.nlm.nih.gov, not a full URL, which is why your FTP('ftp://...') call fails with a getaddrinfo error. After logging in, use retrbinary, which pulls the file in binary mode and invokes a callback on each chunk of the file; you can use that to load it into a string.
from ftplib import FTP

ftp = FTP('ftp.ncbi.nlm.nih.gov')
ftp.login()  # Username: anonymous password: anonymous#

# Setup a cheap way to catch the data (could use StringIO too)
data = []

def handle_binary(more_data):
    data.append(more_data)

resp = ftp.retrbinary("RETR pub/pmc/PMC-ids.csv.gz", callback=handle_binary)
data = "".join(data)
Bonus points: how about we decompress the string while we're at it?
Easy mode, using the data string from above:
import gzip
import StringIO
zippy = gzip.GzipFile(fileobj=StringIO.StringIO(data))
uncompressed_data = zippy.read()
A little bit better, a full solution:
from ftplib import FTP
import gzip
import StringIO

ftp = FTP('ftp.ncbi.nlm.nih.gov')
ftp.login()  # Username: anonymous password: anonymous#

sio = StringIO.StringIO()

def handle_binary(more_data):
    sio.write(more_data)

resp = ftp.retrbinary("RETR pub/pmc/PMC-ids.csv.gz", callback=handle_binary)
sio.seek(0)  # Go back to the start

zippy = gzip.GzipFile(fileobj=sio)
uncompressed = zippy.read()
In reality, it would be much better to decompress on the fly but I don't see a way to do that with the built in libraries (at least not easily).
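For what it's worth, the standard zlib module can be coaxed into decompressing incrementally if you ask it to expect a gzip header (wbits = 16 + MAX_WBITS). A sketch of that idea (my addition, using the same ftplib calls as above):
from ftplib import FTP
import zlib

decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)  # expect gzip framing
chunks = []

def handle_binary(more_data):
    # inflate each chunk as it arrives instead of buffering the whole file
    chunks.append(decompressor.decompress(more_data))

ftp = FTP('ftp.ncbi.nlm.nih.gov')
ftp.login()
ftp.retrbinary("RETR pub/pmc/PMC-ids.csv.gz", callback=handle_binary)
uncompressed = "".join(chunks) + decompressor.flush()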
There are two easy ways I can think of to download a file using FTP and store it locally:
Using ftplib:
from ftplib import FTP
ftp = FTP('ftp.ncbi.nlm.nih.gov')
ftp.login()
ftp.cwd('pub/pmc')
ftp.retrbinary('RETR PMC-ids.csv.gz', open('PMC-ids.csv.gz', 'wb').write)
ftp.quit()
Using urllib
from urllib import urlretrieve
urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/PMC-ids.csv.gz", "PMC-ids.csv.gz")
If you don't want to download and store it to a file, but you want to process it gradually as it comes, I suggest using urllib2:
from urllib2 import urlopen

u = urlopen("ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/readme.txt")
for line in u:
    print line
which prints your file line by line.
That is not possible. To process data on the server, you need some sort of execution permission, be it for a shell script you would send or SQL access.
FTP is pure file transfer; no execution is allowed. You will need to either enable SSH access, load the data into a database and access that with queries, or download the file with urllib and then process it locally, like this:
import urllib
handle = urllib.urlopen('ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/PMC-ids.csv.gz')
# Use data, maybe: buffer = handle.read()
In particular, I think the third one is the only zero-effort solution.

How to save "complete webpage" not just basic html using Python

I am using the following code to save a webpage using Python:
import urllib
import sys
from bs4 import BeautifulSoup
url = 'http://www.vodafone.de/privat/tarife/red-smartphone-tarife.html'
f = urllib.urlretrieve(url,'test.html')
Problem: this code saves the html as basic html, without the javascripts, images, etc. I want to save the webpage complete (like the option we have in a browser).
Update:
I am now using the following code to save all the js/images/css files of the webpage so that it can be saved as a complete webpage, but my output html is still getting saved as basic html:
import pycurl
import StringIO
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.vodafone.de/privat/tarife/red-smartphone-tarife.html")
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform()
html = b.getvalue()
#print html
fh = open("file.html", "w")
fh.write(html)
fh.close()
Try emulating your browser with Selenium. This script pops up the Save As dialog for the webpage. You will still have to figure out how to emulate pressing Enter for the download to start, as the file dialog is out of Selenium's reach (how you do it is also OS dependent).
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
br = webdriver.Firefox()
br.get('http://www.google.com/')
save_me = ActionChains(br).key_down(Keys.CONTROL)\
.key_down('s').key_up(Keys.CONTROL).key_up('s')
save_me.perform()
Also, I think following @Amber's suggestion of grabbing the linked resources may be simpler, and thus a better solution. Still, I think using Selenium is a good starting point, as br.page_source will get you the entire DOM along with the dynamic content generated by JavaScript.
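For example (a two-line sketch of that page_source route, my addition): once the page has rendered, you can dump the live DOM to disk, though linked resources still have to be fetched separately:
html = br.page_source  # the JavaScript-rendered DOM, not the raw response
with open('page.html', 'wb') as f:
    f.write(html.encode('utf-8'))  # page_source is unicode; encode before writing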
You can do that easily with the simple Python library pywebcopy.
For the current version, 5.0.1:
from pywebcopy import save_webpage
url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}
save_webpage(url, download_folder, **kwargs)
You will have the html, css and js all in your download_folder, working completely like the original site.
To get the script above by @rajatomar788 to run, I had to do all of the following installs first.
To run pywebcopy you will need to install the following packages:
pip install pywebcopy
pip install pyquery
pip install w3lib
pip install parse
pip install lxml
After that it worked with a few errors, but I did get the folder filled with the files that make up the webpage.
webpage - INFO - Starting save_assets Action on url: 'http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html'
webpage - Level 100 - Queueing download of <89> asset files.
Exception in thread <Element(LinkTag, file:///++resource++images/favicon2.ico)>:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\threading.py", line 917, in _bootstrap_inner
self.run()
File "C:\ProgramData\Anaconda3\lib\threading.py", line 865, in run
self._target(*self._args, **self._kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 312, in run
super(LinkTag, self).run()
File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 58, in run
self.download_file()
File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\elements.py", line 107, in download_file
req = SESSION.get(url, stream=True)
File "C:\ProgramData\Anaconda3\lib\site-packages\pywebcopy\configs.py", line 244, in get
return super(AccessAwareSession, self).get(url, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 546, in get
return self.request('GET', url, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 640, in send
adapter = self.get_adapter(url=request.url)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 731, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for 'file:///++resource++images/favicon2.ico'
webpage - INFO - Starting save_html Action on url: 'http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html'
Try saveFullHtmlPage below, or adapt it.
It saves a modified *.html and downloads the javascripts, css and images referenced by the script, link and img tags (the tags_inner dict keys) into a folder suffixed _files.
import os, sys, re
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def saveFullHtmlPage(url, pagepath='page', session=requests.Session(), html=None):
    """Save web page html and supported contents
        * pagepath : path-to-page
        It will create a file `'path-to-page'.html` and a folder `'path-to-page'_files`
    """
    def savenRename(soup, pagefolder, session, url, tag, inner):
        if not os.path.exists(pagefolder):  # create only once
            os.mkdir(pagefolder)
        for res in soup.findAll(tag):  # images, css, etc..
            if res.has_attr(inner):  # check inner tag (file object) MUST exists
                try:
                    filename, ext = os.path.splitext(os.path.basename(res[inner]))  # get name and extension
                    filename = re.sub(r'\W+', '', filename) + ext  # clean special chars from name
                    fileurl = urljoin(url, res.get(inner))
                    filepath = os.path.join(pagefolder, filename)
                    # rename html ref so can move html and folder of files anywhere
                    res[inner] = os.path.join(os.path.basename(pagefolder), filename)
                    if not os.path.isfile(filepath):  # was not downloaded
                        with open(filepath, 'wb') as file:
                            filebin = session.get(fileurl)
                            file.write(filebin.content)
                except Exception as exc:
                    print(exc, file=sys.stderr)
    if not html:
        html = session.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    path, _ = os.path.splitext(pagepath)
    pagefolder = path + '_files'  # page contents folder
    tags_inner = {'img': 'src', 'link': 'href', 'script': 'src'}  # tag & inner attribute to grab
    for tag, inner in tags_inner.items():  # saves resource files and renames refs
        savenRename(soup, pagefolder, session, url, tag, inner)
    with open(path + '.html', 'wb') as file:  # saves modified html doc
        file.write(soup.prettify('utf-8'))
Example: saving google.com as google.html with its contents in a google_files folder (in the current folder):
saveFullHtmlPage('https://www.google.com', 'google')
