I'm trying to implement a Python script that will compare the last-modified dates of a local file and a remotely hosted file.
If the remote file is newer it should:
- delete the local file
- download the remote file with the last modified date intact
The closest answer I've found is "Last Modified of file downloaded does not match its HTTP header", but as far as I can tell that approach downloads the whole file anyway, so it doesn't save much time or bandwidth.
What I'd like to do is just inspect the remote file's headers rather than download the whole file, which I believe should be quicker.
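What I have in mind is something along these lines - just a sketch, reusing the placeholder URL from below, and not yet wired into my script:

import requests

# A HEAD request returns only the response headers, not the file body.
response = requests.head('http://example.com/somefile.xml')
print(response.headers.get('Last-Modified'))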
Here's my current code, which is very messy and noobish (see the string replace, etc.). I'm sure there's a better/quicker way - what can you suggest?
remote_source = 'http://example.com/somefile.xml'
local_source = 'path/to/myfile.xml'
if path.exists(local_source):
    local_source_last_modified = os.path.getmtime(local_source)
    local_source_last_modified = datetime.datetime.fromtimestamp(local_source_last_modified).strftime('(%Y, %m, %d, %H, %M, %S)')
    conn = urllib.urlopen(remote_source)
    remote_source_last_modified = conn.info().getdate('last-modified')
    remote_source_last_modified = str(remote_source_last_modified)
    remote_source_last_modified = remote_source_last_modified.replace(", 0, 1, 0)", ")")
    if local_source_last_modified < remote_source_last_modified:
        pass
    else:
        headers = urlretrieve(remote_source, local_source)[1]
        lmStr = headers.getheader("Last-Modified")
        remote_source_last_modified = mktime(strptime(lmStr, "%a, %d %b %Y %H:%M:%S GMT"))
        os.utime(local_source, (remote_source_last_modified, remote_source_last_modified))
else:
    headers = urlretrieve(remote_source, local_source)[1]
    lmStr = headers.getheader("Last-Modified")
    remote_source_last_modified = mktime(strptime(lmStr, "%a, %d %b %Y %H:%M:%S GMT"))
    os.utime(local_source, (remote_source_last_modified, remote_source_last_modified))
Just in case anybody reads this, here's what I ended up with:
# Python 2 imports: urlretrieve lives in urllib, the error classes in urllib2.
import os
import time
import datetime
import requests
from urllib import urlretrieve
from urllib2 import HTTPError, URLError

def syncCheck(file_path):
    remote_source = 'http://example.com/' + os.path.basename(file_path)
    local_source = file_path
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}

    # HEAD request: only the headers come back, not the file body.
    response = requests.head(remote_source, headers=headers)
    remote_source_last_modified = response.headers["last-modified"]
    remote_source_last_modified = time.mktime(
        datetime.datetime.strptime(remote_source_last_modified[:-4], "%a, %d %b %Y %H:%M:%S").timetuple())

    try:
        if os.path.exists(local_source):
            local_source_last_modified = os.path.getmtime(local_source)
            if local_source_last_modified == remote_source_last_modified:
                return
            else:
                try:
                    os.remove(local_source)
                except:
                    return
                urlretrieve(remote_source, local_source)
                os.utime(local_source, (remote_source_last_modified, remote_source_last_modified))
        else:
            urlretrieve(remote_source, local_source)
            os.utime(local_source, (remote_source_last_modified, remote_source_last_modified))
    except HTTPError, e:
        print("HTTP Error: " + str(e.fp.read()))
    except URLError, e:
        print("URL Error: " + str(e.reason))
I am completing the 'Python for Everybody' course on Coursera, and I am stuck on the 'Mailing List Data - Part I' assignment.
I have the following code below:
import sys
import sqlite3
import time
import ssl
from urllib import request
from urllib.parse import urljoin
from urllib.parse import urlparse
import re
from datetime import datetime, timedelta

# Not all systems have this so conditionally define parser
try:
    import dateutil.parser as parser
except:
    pass

def parsemaildate(md):
    # See if we have dateutil
    try:
        pdate = parser.parse(tdate)
        test_at = pdate.isoformat()
        return test_at
    except:
        pass

    # Non-dateutil version - we try our best
    pieces = md.split()
    notz = " ".join(pieces[:4]).strip()

    # Try a bunch of format variations - strptime() is *lame*
    dnotz = None
    for form in ['%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S',
                 '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S',
                 '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M']:
        try:
            dnotz = datetime.strptime(notz, form)
            break
        except:
            continue

    if dnotz is None:
        # print 'Bad Date:',md
        return None

    iso = dnotz.isoformat()

    tz = "+0000"
    try:
        tz = pieces[4]
        ival = int(tz)  # Only want numeric timezone values
        if tz == '-0000': tz = '+0000'
        tzh = tz[:3]
        tzm = tz[3:]
        tz = tzh + ":" + tzm
    except:
        pass

    return iso + tz

conn = sqlite3.connect('emreyavuzher.sqlite')
cur = conn.cursor()
conn.text_factory = str

baseurl = "http://mbox.dr-chuck.net/sakai.devel/"

cur.execute('''CREATE TABLE IF NOT EXISTS Messages
    (id INTEGER UNIQUE, email TEXT, sent_at TEXT,
     subject TEXT, headers TEXT, body TEXT)''')

start = 0
cur.execute('SELECT max(id) FROM Messages')
try:
    row = cur.fetchone()
    if row[0] is not None:
        start = row[0]
except:
    start = 0
    row = None

print(start)

many = 0

# Skip up to five messages
skip = 5
while True:
    if (many < 1):
        sval = input('How many messages:')
        if (len(sval) < 1): break
        many = int(sval)

    start = start + 1
    cur.execute('SELECT id FROM Messages WHERE id=?', (start,))
    try:
        row = cur.fetchone()
        if row is not None: continue
    except:
        row = None

    many = many - 1

    url = baseurl + str(start) + '/' + str(start + 1)

    try:
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        document = request.urlopen(url)
        text = document.read()
        if document.getcode() != 200:
            print("Error code=", document.getcode(), url)
            break
    except KeyboardInterrupt:
        print('')
        print('Program interrupted by user...')
        break
    except:
        print("Unable to retrieve or parse page", url)
        print(sys.exc_info()[0])
        break

    print(url, len(text))

    if not text.startswith('From '):
        if skip < 1:
            print(text)
            print("End of mail stream reached...")
            quit()
        print("Skipping badly formed message")
        skip = skip - 1
        continue
However, the code keeps giving me this error:
Traceback (most recent call last):
File "", line 128, in
TypeError: startswith first arg must be bytes or a tuple of bytes, not str
Would anybody be able to give me a helping hand?
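For context, the traceback points at the text.startswith('From ') check, and document.read() returns bytes in Python 3, so I assume a decode is needed somewhere - an isolated sketch of what I mean (the URL is just the first message of the same archive used above):

from urllib import request

# urlopen().read() gives bytes in Python 3, so decode before comparing
# against a str prefix (assuming the archive pages are UTF-8).
document = request.urlopen('http://mbox.dr-chuck.net/sakai.devel/1/2')
text = document.read().decode('utf-8', errors='replace')
print(text.startswith('From '))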
This is the code I am using to download images from a Google results page. It takes a long time to evaluate and download the images, so I thought of using the BeautifulSoup library for faster evaluation and downloading. Here is the original code:
import time
import sys
import os
import urllib2

search_keyword = ['Australia']
keywords = [' high resolution']

def download_page(url):
    import urllib2
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        page = response.read()
        return page
    except:
        return "Page Not found"

def _images_get_next_item(s):
    start_line = s.find('rg_di')
    if start_line == -1:
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        start_line = s.find('"class="rg_meta"')
        start_content = s.find('"ou"', start_line + 1)
        end_content = s.find(',"ow"', start_content + 1)
        content_raw = str(s[start_content + 6:end_content - 1])
        return content_raw, end_content

def _images_get_all_items(page):
    items = []
    while True:
        item, end_content = _images_get_next_item(page)
        if item == "no_links":
            break
        else:
            items.append(item)
            time.sleep(0.1)
            page = page[end_content:]
    return items

t0 = time.time()
i = 0
while i < len(search_keyword):
    items = []
    iteration = "Item no.: " + str(i + 1) + " -->" + " Item name = " + str(search_keyword[i])
    print (iteration)
    print ("Evaluating...")
    search_keywords = search_keyword[i]
    search = search_keywords.replace(' ', '%20')

    try:
        os.makedirs(search_keywords)
    except OSError, e:
        if e.errno != 17:
            raise
        pass

    j = 0
    while j < len(keywords):
        pure_keyword = keywords[j].replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + pure_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
        raw_html = (download_page(url))
        time.sleep(0.1)
        items = items + (_images_get_all_items(raw_html))
        j = j + 1

    print ("Total Image Links = " + str(len(items)))
    print ("\n")

    info = open('output.txt', 'a')
    info.write(str(i) + ': ' + str(search_keyword[i - 1]) + ": " + str(items) + "\n\n\n")
    info.close()

    t1 = time.time()
    total_time = t1 - t0
    print("Total time taken: " + str(total_time) + " Seconds")
    print ("Starting Download...")

    k = 0
    errorCount = 0
    while (k < len(items)):
        from urllib2 import Request, urlopen
        from urllib2 import URLError, HTTPError
        try:
            req = Request(items[k], headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
            response = urlopen(req, None, 15)
            output_file = open(search_keywords + "/" + str(k + 1) + ".jpg", 'wb')
            data = response.read()
            output_file.write(data)
            response.close();

            print("completed ====> " + str(k + 1))
            k = k + 1;
        except IOError:
            errorCount += 1
            print("IOError on image " + str(k + 1))
            k = k + 1;
        except HTTPError as e:
            errorCount += 1
            print("HTTPError" + str(k))
            k = k + 1;
        except URLError as e:
            errorCount += 1
            print("URLError " + str(k))
            k = k + 1;

    i = i + 1
    print("\n")

print("Everything downloaded!")
print("\n" + str(errorCount) + " ----> total Errors")
I thought that editing download_page() as below would make the code work with the BeautifulSoup library and finish faster:
def download_page(url):
    import urllib2
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib2.Request(url, headers=headers)
        #response = urllib2.urlopen(req)
        #page = response.read()
        return BeautifulSoup(urlopen(Request(req)), 'html.parser')
    except:
        return "Page Not found"
But the above code returns nothing. Could you let me know what I should change to make this work properly with BeautifulSoup?
You can't just pass Google headers like that. The search engine is a lot more complex than simply substituting some keywords into a GET URL.
HTML is a markup language that is only useful for one-way rendering of human-readable information. For your application you need machine-readable data rather than trying to decipher text meant for humans. Google already has a very comprehensive API, https://developers.google.com/custom-search/, which is easy to use and a much better way of achieving this than scraping with BeautifulSoup.
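To illustrate, an image search through that API looks roughly like this - a sketch only, and the API key and search-engine ID are placeholders you create in the Google developer console:

import requests

# Placeholders - create these in the Google developer console.
API_KEY = 'YOUR_API_KEY'
CX = 'YOUR_SEARCH_ENGINE_ID'

params = {
    'key': API_KEY,
    'cx': CX,
    'q': 'Australia high resolution',
    'searchType': 'image',  # ask for image results instead of web pages
    'num': 10,
}
resp = requests.get('https://www.googleapis.com/customsearch/v1', params=params)
resp.raise_for_status()

# Each result item carries a direct link to the image - no HTML scraping needed.
for item in resp.json().get('items', []):
    print(item['link'])

The JSON you get back is machine-readable, so there is nothing to pick apart out of rendered HTML.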
I use Python to download images from a website; sometimes the downloaded image's content length is zero, even though the image can be accessed normally in a web browser.
I have tried three methods and get the same result, so how can I resolve this problem?
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 13:51:42 2017
"""
import urllib
import urllib2
import re
import uuid
import os
import requests
from lxml import etree
from multiprocessing import Pool

url = 'https://www.sina.com.cn/'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'

request = urllib2.Request(url)
request.add_header('User-Agent', user_agent)
response = urllib2.urlopen(request)
content = response.read()
tree = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
node = tree.xpath("//img/@src")

dic1 = {}
dic2 = {}
localPath = 'E:\\pictures\\'

def generateFileName():
    return str(uuid.uuid1())

def createFileWithFileName(localPathParam, fileName):
    totalPath = localPathParam + '\\' + fileName
    if not os.path.exists(totalPath):
        file = open(totalPath, 'wb')
        file.close()
    return totalPath

def worker(i):
    path = node[i]
    if not (dic1.has_key(path)):
        dic1[path] = 1
        index = path.rfind('/')
        suffix = path[index + 1:]
        filename = suffix
        #filename = generateFileName()+'.'+suffix
        if (re.search(r'^(https?:)?\/\/', path)):
            #print('save picture %s as %s' % (path,filename))
            '''
            #this code get the same result too
            try:
                urllib.urlretrieve(path, createFileWithFileName(localPath, filename))
            except Exception, ex:
                print(ex.message)
            '''
            with open(localPath + filename, 'wb') as handle:
                response = requests.get(path, timeout=60)
                if not response.ok:
                    print response
                else:
                    print 'wrong when get ' + path
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
            '''
            #this code get the same result too
            try:
                req = urllib2.Request(path)
                req.add_header('User-Agent', user_agent)
                picture = urllib2.urlopen(url=path, timeout=5).read()
                document = open(localPath + filename, 'wb')
                document.write(picture)
                document.close()
            except Exception, ex:
                print(ex.message)
            '''

if __name__ == '__main__':
    p = Pool()
    for i in range(len(node)):
        p.apply_async(worker, args=(i,))
    print 'Waiting for all subprocesses done...'
    p.close()
    p.join()
    print 'All subprocesses done.'
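One thing I could add: a quick check of what the server actually returns for a single URL when I send browser-like headers - a sketch only; the image URL is a placeholder, and the Referer header is just a guess at what the browser sends that my script does not:

import requests

# Placeholder URL - swap in one of the image links that comes back empty.
image_url = 'https://www.sina.com.cn/favicon.ico'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    'Referer': 'https://www.sina.com.cn/',
}
resp = requests.get(image_url, headers=headers, timeout=60)
print('%s %s %s' % (resp.status_code, resp.headers.get('Content-Length'), len(resp.content)))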
Little bit of background: I'm using Python 2.7.12 on a Windows 10 computer.
This is by far one of the oddest problems I have ever encountered with Python.
I have written a script that makes a GET request to an API, with the correct headers, and gets some XML data back. For the record, when I paste the script like this into a Python file and run it via CMD, it works perfectly fine.
But..
It stops working as soon as I wrap this inside a function. Nothing else, just wrap it inside a function, and use
if __name__ == '__main__':
    my_new_function()
to run it from CMD, and it won't work anymore. The request still runs, but the API says I have the wrong auth credentials, and thus I don't get any data back.
I went over every string in this code, and they are all ASCII-encoded. I also checked the timestamps, and they are all correct.
This is my script:
import time
import hmac
import hashlib
import base64
import requests

SECRET_KEY = 'YYY'
PUBLIC_KEY = 'XXX'

content_type = 'application/xml'
date = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
method = 'GET'
uri = '/uri'

msg = """{method}
{content_type}
{date}
x-bol-date:{date}
{uri}""".format(content_type=content_type,
                date=date,
                method=method,
                uri=uri)

h = hmac.new(
    SECRET_KEY,
    msg, hashlib.sha256)
b64 = base64.b64encode(h.digest())

signature = PUBLIC_KEY + b':' + b64

headers = {'Content-Type': content_type,
           'X-BOL-Date': date,
           'X-BOL-Authorization': signature}

r = requests.get('example.com/uri', headers=headers)
the same code inside a function:
def get_orders():
    SECRET_KEY = 'XXX'
    PUBLIC_KEY = 'YYY'

    content_type = 'application/xml'
    date = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
    method = 'GET'
    uri = '/uri'

    msg = """{method}
    {content_type}
    {date}
    x-bol-date:{date}
    {uri}""".format(content_type=content_type,
                    date=date,
                    method=method,
                    uri=uri)

    h = hmac.new(
        SECRET_KEY,
        msg, hashlib.sha256)
    b64 = base64.b64encode(h.digest())

    signature = PUBLIC_KEY + b':' + b64

    headers = {'Content-Type': content_type,
               'X-BOL-Date': date,
               'X-BOL-Authorization': signature}

    r = requests.get('example.com/uri', headers=headers)

if __name__ == '__main__':
    get_orders()
I think your multi-line string is picking up extra spaces when you indent it inside a function. Build it by concatenating (or joining) the lines instead and it should work.
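Roughly what that looks like - a sketch reusing the values from the question, so the signing string gets no indentation-injected spaces no matter where the code lives:

import time

# Build the message by joining explicit lines; indenting this code inside a
# function can no longer leak leading spaces into the string.
method = 'GET'
content_type = 'application/xml'
date = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
uri = '/uri'

msg = "\n".join([
    method,
    content_type,
    date,
    'x-bol-date:' + date,
    uri,
])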
I have a little problem with caching images in the browser for my App Engine application.
I'm sending Last-Modified, Expires and Cache-Control headers, but the image is loaded from the server every time.
Here is the header part of the code:
response['Content-Type'] = 'image/jpg'
response['Last-Modified'] = current_time.strftime('%a, %d %b %Y %H:%M:%S GMT')
response['Expires'] = current_time + timedelta(days=30)
response['Cache-Control'] = 'public, max-age=2592000'
Here is example code for my attempted fix (also posted on dpaste):
def view_image(request, key):
    data = memcache.get(key)
    if data is not None:
        if (request.META.get('HTTP_IF_MODIFIED_SINCE') >= data['Last-Modified']):
            data.status_code = 304
        return data
    else:
        image_content_blob = #some code to get the image from the data store
        current_time = datetime.utcnow()
        response = HttpResponse()
        last_modified = current_time - timedelta(days=1)
        response['Content-Type'] = 'image/jpg'
        response['Last-Modified'] = last_modified.strftime('%a, %d %b %Y %H:%M:%S GMT')
        response['Expires'] = current_time + timedelta(days=30)
        response['Cache-Control'] = 'public, max-age=315360000'
        response['Date'] = current_time
        response.content = image_content_blob
        memcache.add(image_key, response, 86400)
        return response
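For what it's worth, the direction I think the conditional-request handling needs to go is to compare If-Modified-Since as parsed dates rather than raw strings, and to send Expires as a formatted HTTP date instead of a datetime object - a self-contained sketch with a made-up modification time:

from datetime import datetime, timedelta

HTTP_DATE = '%a, %d %b %Y %H:%M:%S GMT'

def is_not_modified(if_modified_since_header, last_modified_dt):
    # Compare as datetimes; lexical comparison of HTTP date strings is unreliable.
    if not if_modified_since_header:
        return False
    try:
        since = datetime.strptime(if_modified_since_header, HTTP_DATE)
    except ValueError:
        return False
    return last_modified_dt <= since

last_modified = datetime(2017, 1, 1, 12, 0, 0)  # made-up modification time
headers = {
    'Last-Modified': last_modified.strftime(HTTP_DATE),
    'Expires': (datetime.utcnow() + timedelta(days=30)).strftime(HTTP_DATE),
    'Cache-Control': 'public, max-age=2592000',
}
print(is_not_modified(headers['Last-Modified'], last_modified))  # True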