Trying to scrape a website using Python, LXML, PyQt - Getting wierd results

Trying to scrape a website using Python, LXML, PyQt - Getting wierd results - python

So using code shown in https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/ I'm trying to retrieve the euro value from this link https://btcdirect.eu/nl-nl
But this is the result im getting: [u'1 BTC\xa0', u'\n\t ?\xa00,00\n ']
Can anyone help me out?
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
#Take this class for granted.Just use result of rendering.
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
url = 'https://btcdirect.eu/nl-nl'
r = Render(url)
result = r.frame.toHtml()
#This step is important.Converting QString to Ascii for lxml to process
archive_links = html.fromstring(str(result.toAscii()))
#QString should be converted to string before processed by lxml
formatted_result = str(result.toAscii())
#Next build lxml tree from formatted_result
tree = html.fromstring(formatted_result)
#Now using correct Xpath we are fetching URL of archives
archive_links = tree.xpath('//*[#id="bitcoinkoers"]/strong[1]/text()')
print archive_links

Related

loop over a list of urls using PyQt4

I am trying to loop over a list of URLs using PyQt4 and Beautifulsoup using the following code:
import sys
from bs4 import BeautifulSoup
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, pyqtSignal
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
def __init__(self, urls, cb):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.urls = urls
self.cb = cb
self.crawl()
self.app.exec_()
def crawl(self):
if self.urls:
url = self.urls.pop(0)
print ('Downloading', url)
self.mainFrame().load(QUrl(url))
else:
self.app.quit()
def _loadFinished(self, result):
frame = self.mainFrame()
url = str(frame.url().toString())
html = frame.toHtml()
self.cb(url, html)
self.crawl()
def scrape(url, html):
pass
soup = BeautifulSoup(unicode(html), "lxml")
t = soup.findAll("div", {"class": "detalhamento_label_valor hidden-print ng-binding"})[0].text
print t
urls = ["http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000005?modulo=0&sistema=portal" ,
"http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000005?modulo=0&sistema=portal" ,
"http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000004?modulo=0&sistema=portal" ]
r = Render(urls, cb=scrape)
It seems to work well if the urls are the same [0,1], but it gets stuck once the url changes [2]. I am not really familiar with PyQt4, so I wonder if there is something trivial I might be missing.
EDIT
The program hangs while running the third item of the url list on this operation:
self.mainFrame().load(QUrl(url))
Other than that, the only warning I get is:
libpng warning: iCCP: known incorrect sRGB profile
Though I'm not sure what it means, it does not seem to be connected to the issue.

python qt4 : To referesh and scrape again

I have written a small python code to scrape a table in a web page. It uses qt4 to scrape. Now, The problem is I need to keep scraping the data every 5 mins. I am thinking of refreshing the page and scrape again. How can I refresh the webpage and scrape again every 5 mins?
Below is the code which I am using to scrape.
import sys
from BeautifulSoup import BeautifulSoup
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
import redis
from time import sleep
class Scraper(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
#self.render = Scraper(url)
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
def close_app(self):
self.app.quit()
print "closed"
url = 'https://www.nseindia.com/live_market/dynaContent/live_analysis/top_gainers_losers.htm?cat=G'
r = Scraper(url)
result = r.frame.toHtml()
formatted_result = str(result.toAscii())
soup = BeautifulSoup(formatted_result)
table = soup.find(id="topGainers")
print table

Check this page out.
It provides a very light-weight library for scheduling tasks and should work fine within Qt. How do I get a Cron like scheduler in Python?
But if you are worried about your GUI freezing or just wanting to keep everything native within Qt, check this out : Background thread with QThread in PyQt.

You can use QtCore.QTimer.singleShot(5 * 60, func) function.
def __init__(self, url):
# ...
self.show_page()
def show_page(self)
# display page here
QtCore.QTimer.singleShot(5 * 60, self.show_page)

pyqt4: Loop main Render class?

I have a PyQt4 class that downloads webpages that I use for scrapping purposes.
When I pass a list of urls to the Render class while instantiating it works fine(single call) but when I try to loop the [r = Render(url, cb=scrape)]with a multiple list of urls, after the first loop,the execution stops or hangs without any error thrown.
I want to loop the class separately because the urls list belong to different category and will have to store the contents extracted separately.
I also came to know that only one app can be initiated, if that is the case how to exit the app without quitting it. so that new url list can be used by the same app
I am stuck with this issue for a while. Thanks in advance
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
def __init__(self, urls, cb):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.urls = urls
self.cb = cb
self.crawl()
self.app.exec_()
def crawl(self):
if self.urls:
url = self.urls.pop(0)
print 'Downloading', url
self.mainFrame().load(QUrl(url))
else:
self.app.quit()
def _loadFinished(self, result):
frame = self.mainFrame()
url = str(frame.url().toString())
html = frame.toHtml()
self.cb(url, html)
self.crawl()
def scrape(url, html):
pass # have scraping code here
url1 = ['http://webscraping.com', 'http://webscraping.com/blog']
url2 = ['http://webscraping.com', 'http://webscraping.com/blog']
urls =[]
urls.append(url1)
urls.append(url2)
for url in urls:
r = Render(url, cb=scrape)

The problem is you can only instantiate a single QApplication object. Here is an updated version that avoids this and then only runs Qt's execution loop when downloading a URL:
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
def __init__(self, cb):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.cb = cb
def crawl(self, url):
print 'Downloading', url
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
frame = self.mainFrame()
url = str(frame.url().toString())
html = frame.toHtml()
self.cb(url, html)
self.app.quit()
def scrape(url, html):
pass # add scraping code here
print len(html)
r = Render(cb=scrape)
urls = ['http://webscraping.com', 'http://webscraping.com/blog']
for url in urls:
r.crawl(url)

unfortunately, #hoju 's answer did not work for me.
here is what works for me (basically setting up a timer to check if loading has completed).
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QTimer
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
def __init__(self, url):
QWebPage.__init__(self)
self.frame = None
self.mainFrame().loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
def _loadFinished(self, result):
self.frame = self.mainFrame()
def go_again():
global r, timer, urls
if(len(urls)>0):
print("loading",urls[0])
r = Render(urls.pop())
timer.start(1000)
else:
print("finished")
sys.exit(app.exec_())
def check_done():
global r, timer
if r.frame is not None:
timer.stop()
html_result = r.frame.toHtml()
#do something with html
print("loaded")
go_again()
app = QApplication(sys.argv)
urls = ['http://stackoverflow.com/questions/34603886/pyqt4-loop-main-render-class','http://stackoverflow.com/questions/34603886/pyqt4-loop-main-render-class']
timer = QTimer()
timer.timeout.connect(check_done)
#check every second
go_again()
sys.exit(app.exec_())

Extracting text between the <span> tags with xpath in Python

I'm trying to use python to extract a metric from a website: http://www.bild.de/regional/hamburg/mord/das-denkt-der-presserat-ueber-den-mord-an-unserer-tochter-lisa-41186944.bild.html
I need the text (number) under the yellow "LACHEN" button (now at 149). The XPath to that specific element is //*[#id="jsm_16584"]/ul/li[1]/span
However it does not return any object when I try to query it:
url = "http://www.bild.de/regional/hamburg/mord/das-denkt-der-presserat-ueber-den-mord-an-unserer-tochter-lisa-41186944.bild.html"
req=urllib2.Request(url)
tree = lxml.html.fromstring(urllib2.urlopen(req).read())
metric=tree.xpath('//*[#id="jsm_16584"]/ul/li[1]/span')
print metric
It returns metric as an empty list.

The urlopen is not executing any script you just get raw html so if the data are generated by javascript they are not rendered by using this method. Something like this should work:
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
url = 'http://www.bild.de/regional/hamburg/mord/das-denkt-der-presserat-ueber-den-mord-an-unserer-tochter-lisa-41186944.bild.html'
r = Render(url)
page = r.frame.toHtml()
tree = html.fromstring(page)
metric=tree.xpath('//button[#class="btn-mood-1"]/#data-mood-count')
print(metric)

Scraping Javascript driven web pages with PyQt4 - how to access pages that need authentication?

I have to scrape a very, very simple page on our company's intranet in order to automate one of our internal processes (returning a function's output as successful or not).
I found the following example:
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
url = 'http://sitescraper.net'
r = Render(url)
html = r.frame.toHtml()
From http://blog.sitescraper.net/2010/06/scraping-javascript-webpages-in-python.html and it's almost perfect. I just need to be able to provide authentication to view the page.
I've been looking through the documentation for PyQt4 and I'll admit a lot of it is over my head. If anyone could help, I'd appreciate it.
Edit:
Unfortunately gruszczy's method didn't work for me. When I had done something similar through urllib2, I used the following code and it worked...
username = 'user'
password = 'pass'
req = urllib2.Request(url)
base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
authheader = "Basic %s" % base64string
req.add_header("Authorization", authheader)
handle = urllib2.urlopen(req)

I figured it out. Here's what I ended up with in case it can help someone else.
#!/usr/bin/python
# -*- coding: latin-1 -*-
import sys
import base64
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4 import QtNetwork
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
username = 'username'
password = 'password'
base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
authheader = "Basic %s" % base64string
headerKey = QByteArray("Authorization")
headerValue = QByteArray(authheader)
url = QUrl(url)
req = QtNetwork.QNetworkRequest()
req.setRawHeader(headerKey, headerValue)
req.setUrl(url)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(req)
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
def main():
url = 'http://www.google.com'
r = Render(url)
html = r.frame.toHtml()

Try this:
url = QUrl(url)
url.setUserName(username)
url.setPassword(password)
self.mainFrame().load(url)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Trying to scrape a website using Python, LXML, PyQt - Getting wierd results - python

Related

loop over a list of urls using PyQt4

python qt4 : To referesh and scrape again

pyqt4: Loop main Render class?

Extracting text between the <span> tags with xpath in Python

Scraping Javascript driven web pages with PyQt4 - how to access pages that need authentication?

Categories

Resources