Python short URL expander - python

I have a problem with expanding short URLs, since not all the ones I work with use the same redirection.
The idea is to expand shortened URLs. Here is an example of short URL --> final URL; I need a function that takes the shortened URL and returns the expanded URL:
http://chollo.to/675za --> http://www.elcorteingles.es/limite-48-horas/equipaje/?sorting=priceAsc&aff_id=2118094&dclid=COvjy8Xrz9UCFeMi0wod4ZULuw
So far I have something semi-working, but it fails on some of the above examples:
import requests
import httplib
import urlparse

def unshorten_url(url):
    try:
        parsed = urlparse.urlparse(url)
        h = httplib.HTTPConnection(parsed.netloc)
        h.request('HEAD', parsed.path)
        response = h.getresponse()
        if response.status / 100 == 3 and response.getheader('Location'):
            url = requests.get(response.getheader('Location')).url
            print url
            return url
        else:
            url = requests.get(url).url
            print url
            return url
    except Exception as e:
        print(e)

The expected redirect does not appear to be well-formed according to requests:
import requests

response = requests.get('http://chollo.to/675za')
for resp in response.history:
    print(resp.status_code, resp.url)
print(response.url)
print(response.is_redirect)
Output:
301 http://chollo.to/675za
http://web.epartner.es/click.asp?ref=754218&site=14010&type=text&tnb=39&diurl=https%3A%2F%2Fad.doubleclick.net%2Fddm%2Fclk%2F302111021%3B129203261%3By%3Fhttp%3A%2F%2Fwww.elcorteingles.es%2Flimite-48-horas%2Fequipaje%2F%3Fsorting%3DpriceAsc%26aff_id%3D2118094
False
This is likely intentional on the part of epartner or doubleclick. For these kinds of nested URLs you would need an extra step, for example:
from urllib.parse import unquote
# from urllib import unquote # python2
# if response.url.count('http') > 1:
url = 'http' + response.url.split('http')[-1]
unquote(url)
# http://www.elcorteingles.es/limite-48-horas/equipaje/?sorting=priceAsc&aff_id=2118094
Note: by doing this you might be bypassing intended ad revenue.
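Putting the answer together, here is a minimal sketch of the whole flow; the unshorten_nested_url name and the count('http') > 1 check taken from the commented line above are assumptions about how you would wire it up, not part of the original code:

import requests
from urllib.parse import unquote  # python3

def unshorten_nested_url(short_url):
    # follow the normal redirect chain first
    final_url = requests.get(short_url).url
    # if the landing URL embeds another URL (ad/affiliate trackers),
    # keep only the innermost one and decode its percent-escapes
    if final_url.count('http') > 1:
        final_url = unquote('http' + final_url.split('http')[-1])
    return final_url

print(unshorten_nested_url('http://chollo.to/675za'))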

Related

How to continue to the next link when urllib raises a 404 error

I'm trying to download images from a CSV with a lot of links. It works fine until some link is broken (urllib.error.HTTPError: HTTP Error 404: Not Found).
import pandas as pd
import urllib.request
import urllib.error

opener = urllib.request.build_opener()

def url_to_jpg(i, url, file_path):
    filename = "image-{}".format(i)
    full_path = "{}{}".format(file_path, filename)
    opener.addheaders = [('User-Agent', 'Chrome/5.0')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(url, full_path)
    print("{} Saved".format(filename))
    return None

filename = "listado.csv"
file_path = "/Users/marcelomorelli/Downloads/tapas/imagenes"
urls = pd.read_csv(filename)

for i, url in enumerate(urls.values):
    url_to_jpg(i, url[0], file_path)
Thanks!
Any idea how I can make Python continue on to the next link in the list every time it gets that error?
You can use a try/except pattern and skip the links that fail. Your loop would look like this:
for i, url in enumerate(urls.values):
    try:
        url_to_jpg(i, url[0], file_path)
    except Exception as e:
        print(f"Failed due to: {e}")
Reference: https://docs.python.org/3/tutorial/errors.html
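If you would rather skip only the 404s mentioned in the question instead of silencing every exception, a narrower variant could look like this (assuming the same url_to_jpg helper and urls DataFrame from above):

import urllib.error

for i, url in enumerate(urls.values):
    try:
        url_to_jpg(i, url[0], file_path)
    except urllib.error.HTTPError as e:
        # broken link (e.g. HTTP Error 404): report it and move on
        print(f"Skipping {url[0]}: {e}")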

Unshorten url in python 3

I am using this code for unshortening URLs in Python 3, but the code returns the URL as it is (shortened), so what should I do to get it unshortened?
import requests
import http.client
import urllib.parse as urlparse

def unshortenurl(url):
    parsed = urlparse.urlparse(url)
    h = http.client.HTTPConnection(parsed.netloc)
    h.request('HEAD', parsed.path)
    response = h.getresponse()
    if response.status/100 == 3 and response.getheader('Location'):
        return response.getheader('Location')
    else:
        return url
In Python 3, response.status/100 == 3 would be True only for status code 300, because / is true division; for any other 3xx code it would be False. Use floor division instead, response.status//100 == 3, or some other way to test for redirection codes.
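For example, a minimal corrected version of the function above (only the division operator changes):

def unshortenurl(url):
    parsed = urlparse.urlparse(url)
    h = http.client.HTTPConnection(parsed.netloc)
    h.request('HEAD', parsed.path)
    response = h.getresponse()
    # floor division: 301, 302, 303, 307, ... all give 3 here, not just 300
    if response.status // 100 == 3 and response.getheader('Location'):
        return response.getheader('Location')
    return url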
EDIT: It looks like you are using the code from the SO question posted by @Aybars, and there is a comment at the top of that snippet explaining what to do in Python 3. Also, it would have been nice to mention the source of the code.

Extract part of a url using pattern matching in python

I want to extract part of a URL from a list of links using pattern matching in Python.
Examples:
http://www.fairobserver.com/about/
http://www.fairobserver.com/about/interview/
This is my regex:
re.match(r'(http?|ftp)(://[a-zA-Z0-9+&/##%?=~_|!:,.;]*)(.\b[a-z]{1,3}\b)(/about[a-zA-Z-_]*/?)', str(href), re.IGNORECASE)
I want to get only the links ending with /about or /about/, but the above regex selects all links containing the word "about".
I suggest you parse your URLs with an appropriate library, e.g. urlparse, instead.
E.g.
import urlparse

samples = [
    "http://www.fairobserver.com/about/",
    "http://www.fairobserver.com/about/interview/",
]

def about_filter(urls):
    for url in urls:
        parsed = urlparse.urlparse(url)
        if parsed.path.endswith('/about/'):
            yield url
Yielding:
>>> print list(about_filter(samples))
['http://www.fairobserver.com/about/']
Or
def about_filter(urls):
    for url in urls:
        parsed = urlparse.urlparse(url)
        if parsed.path.startswith('/about'):
            yield url
Yielding
>>> print list(about_filter(samples))
['http://www.fairobserver.com/about/', 'http://www.fairobserver.com/about/interview/']
This matches a path of exactly /about/ or /about, per your comment clarification. Below is a urlparse-based version that works in both Python 2 and 3.
try:
    # https://docs.python.org/3.5/library/urllib.parse.html?highlight=urlparse#urllib.parse.urlparse
    # python 3
    from urllib.parse import urlparse
except ImportError:
    # https://docs.python.org/2/library/urlparse.html#urlparse.urlparse
    # python 2
    from urlparse import urlparse

urls = (
    'http://www.fairobserver.com/about/',
    'http://www.fairobserver.com/about/interview/',
    'http://www.fairobserver.com/interview/about/',
)

for url in urls:
    print("{}: path is /about? {}".format(
        url, urlparse(url.rstrip('/')).path == '/about'))
Here is the output:
http://www.fairobserver.com/about/: path is /about? True
http://www.fairobserver.com/about/interview/: path is /about? False
http://www.fairobserver.com/interview/about/: path is /about? False
The important part is urlparse(url.rstrip('/')).path == '/about', which normalizes the URL by stripping off the trailing / before parsing so that we don't have to use a regex.
If you just want links ending in either, use an HTML parser and str.endswith:
import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.fairobserver.com/about/")
print(list(filter(lambda x: x.endswith(("/about", "/about/")),
                  (a["href"] for a in BeautifulSoup(r.content).find_all("a", href=True)))))
You can also use a regex with BeautifulSoup:
r = requests.get("http://www.fairobserver.com/about/")
print([a["href"] for a in BeautifulSoup(r.content).find_all(
    "a", href=re.compile(".*/about/$|.*/about$"))])

Is there a way to ignore a 302 Moved Temporarily redirection, or to find out what causes it?

I am writing a parsing script and need to access many web pages like this one.
Whenever I try to get this page with urlopen and then read(), I get redirected to this page.
When I open the same links from Google Chrome, the redirect happens only rarely, mostly when I open a URL directly rather than by clicking it in the site menus.
Is there a way to dodge that redirect, or to simulate jumping to the URL from the website menus, with Python 3?
Example code:
from urllib.request import urlopen
import re

def getItemsFromPage(url):
    with urlopen(url) as page:
        html_doc = str(page.read())
    return re.findall('(http://www.charitynavigator.org/index.cfm\?bay=search\.summary&amp;orgid=[\d]+)', html_doc)

url = 'http://www.charitynavigator.org/index.cfm?bay=search.alpha&ltr=1'
item_urls = getItemsFromPage(url)
with urlopen(item_urls[0]) as item_page:
    print(item_page.read().decode('utf-8'))  # Here I get search.advanced instead of the item page
In fact, it's a weird problem with ampersands in the raw HTML data. When you visit the webpage and click a link, the HTML entity &amp; is interpreted by the browser as "&" and it works. However, Python reads the data as it is, that is, as raw data. So:
import urllib.request as net
from html.parser import HTMLParser
import re

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0",
}

def unescape(items):
    html = HTMLParser()
    unescaped = []
    for i in items:
        unescaped.append(html.unescape(i))
    return unescaped

def getItemsFromPage(url):
    request = net.Request(url, headers=headers)
    response = str(net.urlopen(request).read())
    # --------------------------
    # FIX AMPERSANDS - unescape
    # --------------------------
    links = re.findall('(http://www.charitynavigator.org/index.cfm\?bay=search\.summary&amp;orgid=[\d]+)', response)
    unescaped_links = unescape(links)
    return unescaped_links

url = 'http://www.charitynavigator.org/index.cfm?bay=search.alpha&ltr=1'
item_urls = getItemsFromPage(url)
request = net.Request(item_urls[0], headers=headers)
print(item_urls)
response = net.urlopen(request)

# DEBUG RESPONSE
print(response.url)
print(80 * '-')
print("<title>Charity Navigator Rating - 10,000 Degrees</title>" in (response.read().decode('utf-8')))
Your problem is that you are not replacing &amp; with & in the URL string. I rewrote your code using urllib3 as below and got the expected webpages.
import re
import urllib3

def getItemsFromPage(url):
    # create connection pool object (urllib3-specific)
    localpool = urllib3.PoolManager()
    with localpool.request('get', url) as page:
        html_doc = page.data.decode('utf-8')
    return re.findall('(http://www.charitynavigator.org/index.cfm\?bay=search\.summary&amp;orgid=[\d]+)', html_doc)

# the master webpage
url_master = 'http://www.charitynavigator.org/index.cfm?bay=search.alpha&ltr=1'

# name and store the downloaded contents for testing purposes
file_folder = "R:"
file_mainname = "test"

# parse the master webpage
items_urls = getItemsFromPage(url_master)

# create pool
mypool = urllib3.PoolManager()

i = 0
for url in items_urls:
    # file name to be saved
    file_name = file_folder + "\\" + file_mainname + str(i) + ".htm"
    # replace '&amp;' with '&'
    url_OK = re.sub(r'&amp;', r'&', url)
    # print revised url
    print(url_OK)
    ### the urllib3-pythonic way of web page retrieval ###
    with mypool.request('get', url_OK) as page, open(file_name, 'w') as f:
        f.write(page.data.decode('utf-8'))
    i += 1
(verified on python 3.4 eclipse PyDev win7 x64)

Unshorten Flic.kr URLs

I have a Python script that unshortens URLs based on the answer posted here. So far it has worked pretty well, e.g., with youtu.be, goo.gl, t.co, bit.ly, and tinyurl.com. But now I noticed that it doesn't work for Flickr's own URL shortener flic.kr.
For example, when I enter the URL
https://flic.kr/p/qf3mGd
into a browser, I get redirected correctly to
https://www.flickr.com/photos/106783633#N02/15911453212/
However, when I use the Python script to unshorten the same URL, I get the following redirects:
https://flic.kr/p/qf3mgd
http://www.flickr.com/photo.gne?short=qf3mgd
http://www.flickr.com/signin/?acf=%2Fphoto.gne%3Fshort%3Dqf3mgd
https://login.yahoo.com/config/login?.src=flickrsignin&.pc=8190&.scrumb=[...]
thus eventually ending up on the Yahoo login page. Unshort.me, by the way, can unshorten the URL correctly. What am I missing here?
Here is the full source code of my script. I stumbled upon some pathological cases with the original script:
import urlparse
import httplib

def unshorten_url(url, max_tries=10):
    return __unshorten_url(url, [], max_tries)

def __unshorten_url(url, check_urls, max_tries):
    if max_tries == 0:
        if len(check_urls) > 0:
            return check_urls[0]
        return url
    if url in check_urls:
        return url
    unshortended = ''
    try:
        parsed = urlparse.urlparse(url)
        h = httplib.HTTPConnection(parsed.netloc)
        h.request('HEAD', url)
    except:
        return None
    try:
        response = h.getresponse()
    except:
        return url
    if response.status/100 == 3 and response.getheader('Location'):
        unshortended = response.getheader('Location')
    else:
        return url
    #print max_tries, unshortended
    if unshortended != url:
        if 'http' not in unshortended:
            return url
        check_urls.append(url)
        return __unshorten_url(unshortended, check_urls, (max_tries-1))
    else:
        return unshortended

print unshorten_url('http://t.co/5skmePb7gp')
EDIT: the above is a full working example with a t.co URL.
I'm using Requests [0] rather than httplib in this way, and it works fine with URLs like https://flic.kr/p/qf3mGd:
>>> import requests
>>> requests.head("https://flic.kr/p/qf3mGd", allow_redirects=True, verify=False).url
u'https://www.flickr.com/photos/106783633#N02/15911453212/'
[0] http://docs.python-requests.org/en/latest/
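Wrapped into a function shaped like the original unshorten_url, a minimal sketch could look like this (the timeout value and the max_redirects cap are assumptions; requests follows the whole redirect chain for you):

import requests

def unshorten_url(url, max_redirects=10):
    session = requests.Session()
    session.max_redirects = max_redirects  # roughly plays the role of max_tries above
    try:
        # a HEAD request with allow_redirects=True follows the chain to the final URL
        return session.head(url, allow_redirects=True, timeout=10).url
    except requests.RequestException:
        # network errors, timeouts or too many redirects: fall back to the input
        return url

print(unshorten_url('https://flic.kr/p/qf3mGd'))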
