It would be great if someone could help me multi-thread this script and write its output to a text file.
I am really new to coding, so please help me out.
#!/usr/bin/python
from tornado import ioloop, httpclient
from BeautifulSoup import BeautifulSoup
from mechanize import Browser
import requests
import urllib2
import socket
import sys

def handle_request(response):
    print response.code

global i
i = 0
i -= 1
if i == 0:
    http_client = httpclient.AsyncHTTPClient()

for url in open('urls.txt'):
    try:
        br = Browser()
        br.set_handle_robots(False)
        res = br.open(url, None, 2.5)
        data = res.get_data()
        soup = BeautifulSoup(data)
        title = soup.find('title')
        if soup.title != None:
            print url, title.renderContents(), '\n'
        i += 1
    except urllib2.URLError, e:
        print "Oops, timed out?", '\n'
    except socket.error, e:
        print "Oops, timed out?", '\n'
    except socket.timeout:
        print "Oops, timed out?", '\n'

print 'Processing of list completed, Cheers!!'
sys.exit()

try:
    ioloop.IOLoop.instance().start()
except KeyboardInterrupt:
    ioloop.IOLoop.instance().stop()
I am trying to grab the HTTP title of a list of hosts.
The basic idea you have already implemented is a non-blocking HTTP client.
def handle_request(response):
    if response.error:
        print "Error:", response.error
    else:
        print response.body

for url in ["http://google.com", "http://twitter.com"]:
    http_client = httpclient.AsyncHTTPClient()
    http_client.fetch(url, handle_request)
You could loop over your URLs, and the callback will be called as soon as the response for a specific URL becomes available.
I wouldn't mix mechanize, ioloop, and so on unless it's necessary.
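To tie that back to the original script, here is a minimal sketch (untested, assuming the same callback-style tornado API and BeautifulSoup 3 used above; results.txt is an assumed output file name) that extracts each <title> and appends it to a text file, stopping the IOLoop once every URL has been handled:

from tornado import ioloop, httpclient
from BeautifulSoup import BeautifulSoup

urls = [line.strip() for line in open('urls.txt') if line.strip()]
pending = len(urls)

def handle_request(response):
    global pending
    if not response.error:
        soup = BeautifulSoup(response.body)
        title = soup.find('title')
        if title is not None:
            with open('results.txt', 'a') as out:  # assumed output file name
                out.write('%s %s\n' % (response.effective_url, title.renderContents()))
    pending -= 1
    if pending == 0:  # every callback has fired, so stop the loop
        ioloop.IOLoop.instance().stop()

http_client = httpclient.AsyncHTTPClient()
for url in urls:
    http_client.fetch(url, handle_request)
ioloop.IOLoop.instance().start()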
Apart from that, I recommend grequests. It is a lightweight tool which satisfies your requirements.
import grequests
from bs4 import BeautifulSoup
urls = ['http://google.com', 'http://www.python.org/']
rs = (grequests.get(u) for u in urls)
res = grequests.map(rs)
for r in res:
    soup = BeautifulSoup(r.text)
    print "%s: %s" % (r.url, soup.title.text)
I am connected to the web via VPN and I would like to connect to a news site to grab, well, news. For this a library exists: FinNews. And this is the code:
import FinNews as fn
cnbc_feed = fn.CNBC(topics=['finance', 'earnings'])
print(cnbc_feed.get_news())
print(cnbc_feed.possible_topics())
Now, because of the VPN, the connection won't work and it throws:
<urlopen error [WinError 10061] No connection could be made because
the target machine actively refused it ( client - server )
So I started separately to understand how to make a connection work and it does work (return is "connected"):
import urllib.request
from urllib.error import HTTPError, URLError

proxy = "http://user:pw#proxy:port"
proxies = {"http": "http://%s" % proxy}
url = "http://www.google.com/search?q=test"
headers = {'User-agent': 'Mozilla/5.0'}

try:
    proxy_support = urllib.request.ProxyHandler(proxies)
    opener = urllib.request.build_opener(proxy_support, urllib.request.HTTPHandler(debuglevel=1))
    urllib.request.install_opener(opener)
    req = urllib.request.Request(url, None, headers)
    html = urllib.request.urlopen(req).read()
    #print (html)
    print ("Connected")
except (HTTPError, URLError) as err:
    print("No internet connection.")
Now I have figured out how to access the news and how to make a connection via the VPN, but I can't bring the two together. I want to grab the news via the library through the VPN. I am fairly new to Python, so I guess I don't fully get the logic yet.
EDIT: I tried to combine it with feedparser, based on furas' hint:
import urllib.request
import feedparser
from urllib.error import HTTPError, URLError

proxy = "http://user:pw#proxy:port"
proxies = {"http": "http://%s" % proxy}
#url = "http://www.google.com/search?q=test"
#url = "http://www.reddit.com/r/python/.rss"
url = "https://timesofindia.indiatimes.com/rssfeedstopstories.cms"
headers = {'User-agent': 'Mozilla/5.0'}

try:
    proxy_support = urllib.request.ProxyHandler(proxies)
    opener = urllib.request.build_opener(proxy_support, urllib.request.HTTPHandler(debuglevel=1))
    urllib.request.install_opener(opener)
    req = urllib.request.Request(url, None, headers)
    html = urllib.request.urlopen(req).read()
    #print (html)
    #print ("Connected")
    feed = feedparser.parse(html)
    #print (feed['feed']['link'])
    print ("Number of RSS posts :", len(feed.entries))
    entry = feed.entries[1]
    print ("Post Title :", entry.title)
except (HTTPError, URLError) as err:
    print("No internet connection.")
But I get the same error... this is a hard nut to crack.
May I ask for your advice? Thank you :)
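One thing worth checking in the snippets above: urllib.request.ProxyHandler only routes the schemes listed in its mapping through the proxy, and the proxies dict here covers only "http", while the Times of India feed is served over https, so that request bypasses the proxy entirely. A minimal sketch of a mapping that covers both schemes (assuming the same proxy also handles https traffic):

proxies = {
    "http":  "http://%s" % proxy,
    "https": "http://%s" % proxy,  # without this entry, https:// URLs are fetched directly, not via the proxy
}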
I am looking to parse data from a large number of webpages (>10k) using Python, and I am finding that the function I have written for this often encounters a timeout error every 500 loops or so. I have attempted to fix this with a try/except block, but I would like to improve the function so that it re-attempts to open the URL four or five times before returning the error. Is there an elegant way to do this?
My code below:
def url_open(url):
    from urllib.request import Request, urlopen
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        s = urlopen(req, timeout=50).read()
    except urllib.request.HTTPError as e:
        if e.code == 404:
            print(str(e))
        else:
            print(str(e))
            s = urlopen(req, timeout=50).read()
            raise
    return BeautifulSoup(s, "lxml")
I've used a pattern like this for retrying in the past:
def url_open(url):
    from urllib.request import Request, urlopen
    from urllib.error import HTTPError
    from bs4 import BeautifulSoup

    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    retrycount = 0
    s = None
    while s is None:
        try:
            s = urlopen(req, timeout=50).read()
        except HTTPError as e:
            print(str(e))
            if canRetry(e.code):
                retrycount += 1
                if retrycount > 5:
                    raise
                # time.sleep() for a bit here before the next attempt
            else:
                raise
    return BeautifulSoup(s, "lxml")
You just have to define canRetry somewhere else.
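canRetry isn't spelled out above; a minimal version might just whitelist status codes that usually indicate a temporary condition (the exact set is an assumption, adjust it to your needs):

def canRetry(code):
    # Retry only on codes that tend to be transient.
    return code in (408, 429, 500, 502, 503, 504)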
Question: I have 3 URLs: testurl1, testurl2 and testurl3. I'd like to try testurl1 first; if I get a 404 error, then try testurl2; if that also gives a 404 error, then try testurl3. How can I achieve this? So far I've tried the code below, but it works only for two URLs. How do I add support for the third URL?
from urllib2 import Request, urlopen
from urllib2 import URLError, HTTPError

def checkfiles():
    req = Request('http://testurl1')
    try:
        response = urlopen(req)
        url1 = ('http://testurl1')
    except HTTPError, URLError:
        url1 = ('http://testurl2')

    print url1
    finalURL = 'wget ' + url1 + '/testfile.tgz'
    print finalURL

checkfiles()
Another job for a plain old for loop:
for url in testurl1, testurl2, testurl3:
    req = Request(url)
    try:
        response = urlopen(req)
    except HTTPError as err:
        if err.code == 404:
            continue
        raise
    else:
        # do what you want with successful response here (or outside the loop)
        break
else:
    # They ALL errored out with HTTPError code 404. Handle this?
    raise err
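Dropped into the checkfiles function from the question, that might look like this (a sketch; the testurl placeholders are kept from the question):

from urllib2 import Request, urlopen, HTTPError

def checkfiles():
    for url in 'http://testurl1', 'http://testurl2', 'http://testurl3':
        try:
            response = urlopen(Request(url))
        except HTTPError as err:
            if err.code == 404:
                continue  # try the next URL
            raise
        else:
            break  # this URL worked, stop trying the rest
    else:
        raise RuntimeError('all three URLs returned 404')
    finalURL = 'wget ' + url + '/testfile.tgz'
    print finalURL

checkfiles()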
Hmmm maybe something like this?
from urllib2 import Request, urlopen
from urllib2 import URLError, HTTPError

def checkfiles():
    try:
        response = urlopen(Request('http://testurl1'))
        url1 = 'http://testurl1'
    except (HTTPError, URLError):
        try:
            response = urlopen(Request('http://testurl2'))
            url1 = 'http://testurl2'
        except (HTTPError, URLError):
            url1 = 'http://testurl3'

    print url1
    finalURL = 'wget ' + url1 + '/testfile.tgz'
    print finalURL

checkfiles()
I'm still new to urllib2, so sorry if this is a basic question, but I couldn't find an answer anywhere that made sense. I tried to make a basic web scraper on the Raspberry Pi with Python 2.7.9 that finds a URL on the site and downloads the file, however I keep getting HTTP 503 errors, and even when I loop over and retry it never goes through.
import cfscrape
import urllib2

def find_hyperlink(text, first, last):
    try:
        start = text.index(first) + len(first)
        end = text.index(last, start)
        return text[start:end]
    except ValueError:
        return ""

def download_hyperlink(name):
    scraper = cfscrape.create_scraper()
    r = scraper.get(name).content
    link = find_hyperlink(r, 'Download (Save as...): <a href="', '">')
    print link
    opener = urllib2.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    try:
        response = opener.open(link)
        print 'downloading episode'
    except urllib2.HTTPError as e:
        if e.code == 503:
            download_hyperlink(name)
        else:
            raise

download_hyperlink('http://www.kiss-anime.me/Anime-sword-art-online-episode-1')
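If you stick with the retry-on-503 approach, an explicit loop with a bounded number of attempts and a pause between them is usually safer than unbounded recursion. A minimal sketch (the helper name, attempt count, and delay are arbitrary choices, not part of the original code):

import time
import urllib2

def open_with_retries(opener, link, attempts=5, delay=10):
    # Try the link a bounded number of times, sleeping between 503 responses.
    for attempt in range(attempts):
        try:
            return opener.open(link)
        except urllib2.HTTPError as e:
            if e.code != 503 or attempt == attempts - 1:
                raise
            time.sleep(delay)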
I want to be able to take a shortened or non-shortened URL and return its un-shortened form. How can I write a Python program to do this?
Additional Clarification:
Case 1: shortened --> unshortened
Case 2: unshortened --> unshortened
e.g. bit.ly/silly in the input array should be google.com in the output array
e.g. google.com in the input array should be google.com in the output array
Send an HTTP HEAD request to the URL and look at the response code. If the code is 30x, look at the Location header to get the unshortened URL. Otherwise, if the code is 20x, then the URL is not redirected; you probably also want to handle error codes (4xx and 5xx) in some fashion. For example:
# This is for Py2k. For Py3k, use http.client and urllib.parse instead, and
# use // instead of / for the division
import httplib
import urlparse

def unshorten_url(url):
    parsed = urlparse.urlparse(url)
    h = httplib.HTTPConnection(parsed.netloc)
    h.request('HEAD', parsed.path)
    response = h.getresponse()
    if response.status/100 == 3 and response.getheader('Location'):
        return response.getheader('Location')
    else:
        return url
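A quick usage example, borrowing the bit.ly link that appears in another answer below (the return value is simply whatever Location header the service sends back):

print unshorten_url('http://bit.ly/cXEInp')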
Using requests:
import requests
session = requests.Session() # so connections are recycled
resp = session.head(url, allow_redirects=True)
print(resp.url)
Unshorten.me has an API that lets you send a JSON or XML request and get the full URL returned.
If you are using Python 3.5+ you can use the Unshortenit module that makes this very easy:
from unshortenit import UnshortenIt
unshortener = UnshortenIt()
uri = unshortener.unshorten('https://href.li/?https://example.com')
Open the url and see what it resolves to:
>>> import urllib2
>>> a = urllib2.urlopen('http://bit.ly/cXEInp')
>>> print a.url
http://www.flickr.com/photos/26432908#N00/346615997/sizes/l/
>>> a = urllib2.urlopen('http://google.com')
>>> print a.url
http://www.google.com/
To unshorten, you can use requests. This is a simple solution that works for me.
import requests
url = "http://foo.com"
site = requests.get(url)
print(site.url)
http://github.com/stef/urlclean

sudo pip install urlclean

import urlclean
urlclean.unshorten(url)
Here is some source code that takes into account most of the useful corner cases:
set a custom timeout.
set a custom User-Agent.
check whether we have to use an HTTP or HTTPS connection.
resolve the input URL recursively and prevent ending up in a loop.
The source code is on GitHub at https://github.com/amirkrifa/UnShortenUrl
Comments are welcome.
import logging
logging.basicConfig(level=logging.DEBUG)

TIMEOUT = 10

class UnShortenUrl:
    def process(self, url, previous_url=None):
        logging.info('Init url: %s' % url)
        import urlparse
        import httplib
        try:
            parsed = urlparse.urlparse(url)
            if parsed.scheme == 'https':
                h = httplib.HTTPSConnection(parsed.netloc, timeout=TIMEOUT)
            else:
                h = httplib.HTTPConnection(parsed.netloc, timeout=TIMEOUT)
            resource = parsed.path
            if parsed.query != "":
                resource += "?" + parsed.query
            try:
                h.request('HEAD',
                          resource,
                          headers={'User-Agent': 'curl/7.38.0'})
                response = h.getresponse()
            except:
                import traceback
                traceback.print_exc()
                return url
            logging.info('Response status: %d' % response.status)
            if response.status/100 == 3 and response.getheader('Location'):
                red_url = response.getheader('Location')
                logging.info('Red, previous: %s, %s' % (red_url, previous_url))
                if red_url == previous_url:
                    return red_url
                return self.process(red_url, previous_url=url)
            else:
                return url
        except:
            import traceback
            traceback.print_exc()
            return None
You can use geturl()
from urllib.request import urlopen

url = "http://bit.ly/silly"  # urlopen needs the scheme; a bare "bit.ly/silly" raises ValueError
unshortened_url = urlopen(url).geturl()
print(unshortened_url)
# google.com
This is a very easy task: you just need to add 4 lines of code, that's it :)
import requests
url = input('Enter url : ')
site = requests.get(url)
print(site.url)
Just run this code and it will successfully unshorten the URL.