Scrape multiple urls using QWebPage

Scrape multiple urls using QWebPage - python

I'm using Qt's QWebPage to render a page that uses javascript to update its content dynamically - so a library that just downloads a static version of the page (such as urllib2) won't work.
My problem is, when I render a second page, about 99% of the time the program just crashes. At other times, it will work three times before crashing. I've also gotten a few segfaults, but it is all very random.
My guess is the object I'm using to render isn't getting deleted properly, so trying to reuse it is possibly causing some problems for myself. I've looked all over and no one really seems to be having this same issue.
Here's the code I'm using. The program downloads web pages from steam's community market so I can create a database of all the items. I need to call the getItemsFromPage function multiple times to get all of the items, as they are broken up into pages (showing results 1-10 out of X amount).
import csv
import re
import sys
from string import replace
from bs4 import BeautifulSoup
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Item:
__slots__ = ("name", "count", "price", "game")
def __repr__(self):
return self.name + "(" + str(self.count) + ")"
def __str__(self):
return self.name + ", " + str(self.count) + ", $" + str(self.price)
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
self.deleteLater()
def getItemsFromPage(appid, page=1):
r = Render("http://steamcommunity.com/market/search?q=appid:" + str(appid) + "#p" + str(page))
soup = BeautifulSoup(str(r.frame.toHtml().toUtf8()))
itemLst = soup.find_all("div", "market_listing_row market_recent_listing_row")
items = []
for k in itemLst:
i = Item()
i.name = k.find("span", "market_listing_item_name").string
i.count = int(replace(k.find("span", "market_listing_num_listings_qty").string, ",", ""))
i.price = float(re.search(r'\$([0-9]+\.[0-9]+)', str(k)).group(1))
i.game = appid
items.append(i)
return items
if __name__ == "__main__":
print "Updating market items to dota2.csv ..."
i = 1
with open("dota2.csv", "w") as f:
writer = csv.writer(f)
r = None
while True:
print "Page " + str(i)
items = getItemsFromPage(570)
if len(items) == 0:
print "No items found, stopping..."
break
for k in items:
writer.writerow((k.name, k.count, k.price, k.game))
i += 1
print "Done."
Calling getItemsFromPage once works fine. Subsequent calls give me my problem. The output of the program is typically
Updating market items to dota2.csv ...
Page 1
Page 2
and then it crashes. It should go on for over 700 pages.

The problem with your program is that you are attempting to create a new QApplication with every url you fetch.
Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.
Here are some examples which show how to use the WebPage class:
Usage:
def my_html_processor(html, url):
print('loaded: [%d chars] %s' % (len(html), url))
import sys
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)
# example 1: process list of urls
urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)
# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))
sys.exit(app.exec_())
PyQt5 WebPage:
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebPage(QWebEnginePage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super().__init__()
self._verbose = verbose
self.loadFinished.connect(self.handleLoadFinished)
def process(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.load(QUrl(url))
return True
def processCurrentPage(self, html):
self.htmlReady.emit(html, self.url().toString())
if not self.fetchNext():
QApplication.instance().quit()
def handleLoadFinished(self):
self.toHtml(self.processCurrentPage)
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super().javaScriptConsoleMessage(*args, **kwargs)
PyQt4 WebPage:
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
class WebPage(QWebPage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super(WebPage, self).__init__()
self._verbose = verbose
self.mainFrame().loadFinished.connect(self.handleLoadFinished)
def start(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.mainFrame().load(QUrl(url))
return True
def processCurrentPage(self):
self.htmlReady.emit(
self.mainFrame().toHtml(), self.mainFrame().url().toString())
print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))
def handleLoadFinished(self):
self.processCurrentPage()
if not self.fetchNext():
QApplication.instance().quit()
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)

Related

Printing several PDFs using PyQt5 [duplicate]

I'm using Qt's QWebPage to render a page that uses javascript to update its content dynamically - so a library that just downloads a static version of the page (such as urllib2) won't work.
My problem is, when I render a second page, about 99% of the time the program just crashes. At other times, it will work three times before crashing. I've also gotten a few segfaults, but it is all very random.
My guess is the object I'm using to render isn't getting deleted properly, so trying to reuse it is possibly causing some problems for myself. I've looked all over and no one really seems to be having this same issue.
Here's the code I'm using. The program downloads web pages from steam's community market so I can create a database of all the items. I need to call the getItemsFromPage function multiple times to get all of the items, as they are broken up into pages (showing results 1-10 out of X amount).
import csv
import re
import sys
from string import replace
from bs4 import BeautifulSoup
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Item:
__slots__ = ("name", "count", "price", "game")
def __repr__(self):
return self.name + "(" + str(self.count) + ")"
def __str__(self):
return self.name + ", " + str(self.count) + ", $" + str(self.price)
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
self.deleteLater()
def getItemsFromPage(appid, page=1):
r = Render("http://steamcommunity.com/market/search?q=appid:" + str(appid) + "#p" + str(page))
soup = BeautifulSoup(str(r.frame.toHtml().toUtf8()))
itemLst = soup.find_all("div", "market_listing_row market_recent_listing_row")
items = []
for k in itemLst:
i = Item()
i.name = k.find("span", "market_listing_item_name").string
i.count = int(replace(k.find("span", "market_listing_num_listings_qty").string, ",", ""))
i.price = float(re.search(r'\$([0-9]+\.[0-9]+)', str(k)).group(1))
i.game = appid
items.append(i)
return items
if __name__ == "__main__":
print "Updating market items to dota2.csv ..."
i = 1
with open("dota2.csv", "w") as f:
writer = csv.writer(f)
r = None
while True:
print "Page " + str(i)
items = getItemsFromPage(570)
if len(items) == 0:
print "No items found, stopping..."
break
for k in items:
writer.writerow((k.name, k.count, k.price, k.game))
i += 1
print "Done."
Calling getItemsFromPage once works fine. Subsequent calls give me my problem. The output of the program is typically
Updating market items to dota2.csv ...
Page 1
Page 2
and then it crashes. It should go on for over 700 pages.

The problem with your program is that you are attempting to create a new QApplication with every url you fetch.
Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.
Here are some examples which show how to use the WebPage class:
Usage:
def my_html_processor(html, url):
print('loaded: [%d chars] %s' % (len(html), url))
import sys
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)
# example 1: process list of urls
urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)
# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))
sys.exit(app.exec_())
PyQt5 WebPage:
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebPage(QWebEnginePage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super().__init__()
self._verbose = verbose
self.loadFinished.connect(self.handleLoadFinished)
def process(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.load(QUrl(url))
return True
def processCurrentPage(self, html):
self.htmlReady.emit(html, self.url().toString())
if not self.fetchNext():
QApplication.instance().quit()
def handleLoadFinished(self):
self.toHtml(self.processCurrentPage)
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super().javaScriptConsoleMessage(*args, **kwargs)
PyQt4 WebPage:
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
class WebPage(QWebPage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super(WebPage, self).__init__()
self._verbose = verbose
self.mainFrame().loadFinished.connect(self.handleLoadFinished)
def start(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.mainFrame().load(QUrl(url))
return True
def processCurrentPage(self):
self.htmlReady.emit(
self.mainFrame().toHtml(), self.mainFrame().url().toString())
print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))
def handleLoadFinished(self):
self.processCurrentPage()
if not self.fetchNext():
QApplication.instance().quit()
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)

How to make PyQt5 wait n seconds before returning page

I am trying to scrape a webpage using the following PyQt5 function by passing the url into the function as a parameter. the function returns the page data:
def get_html(url):
class Render(QWebEngineView):
def __init__(self, url):
self.html = None
self.app = APP.instance()
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.load(QUrl(url))
while self.html is None:
self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
self.app.quit()
def _callable(self, data):
self.html = data
def _loadFinished(self, result):
self.page().toHtml(self._callable)
return Render(url).html
but the program faces a cloudflare error for unknown reasons to me, and it needs a couple of seconds to load the page properly. So how can I make the function wait for n seconds to fully load the page before proceeding to return the page data?

Instead of directly invoking toHtml you should use a QTimer:
from PyQt5.QtCore import QTimer, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage
APP = QApplication.instance()
if APP is None:
APP = QApplication([])
class PageHelper(QWebEnginePage):
def __init__(self, url, timeout=0):
self.html = ""
self.timeout = timeout
app = APP.instance()
super().__init__()
self.loadFinished.connect(self.handle_load_finished)
self.load(QUrl.fromUserInput(url))
app.exec_()
def handle_load_finished(self, ok):
if ok:
QTimer.singleShot(self.timeout * 1000, self.to_html)
else:
QApplication.quit()
def to_html(self):
self.toHtml(self._html_callable)
def _html_callable(self, html):
self.html = html
QApplication.quit()
def get_html(url, timeout=0):
page = PageHelper(url, timeout)
return page.html
print(get_html("https://stackoverflow.com/questions/68854582", timeout=5))

PyQt5 returns None value when loading Web page content

I am trying to get the content of a web page section. The data in that section is loaded dynamically by javascript. I found some code on here, edited it but when I run the script I return None
Here's the code
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from pprint import pprint
class Page(QWebEnginePage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebEnginePage.__init__(self)
self.html = ''
self.loadFinished.connect(self._on_load_finished)
self.load(QUrl(url))
self.app.exec_()
def _on_load_finished(self):
self.html = self.toHtml(self.Callable)
print('Load finished')
def Callable(self, html_str):
self.html = html_str
self.app.quit()
def main():
page = Page('https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all')
soup = bs.BeautifulSoup(page.html, 'html.parser')
section = soup.find('table', {'id' : 'DataTables_Table_0'})
pprint (section)
if __name__ == '__main__': main()
Here's the output
Load finished
None

The loadFinished signal only indicates that the page has been loaded but after that more DOM elements can be created, and that is the case of the element with id "DataTables_Table_0" which is created moments after the page is loaded.
A possible solution is to inject a script that checks if the element exists, and that notifies so that the HTML is obtained.
import sys
from functools import cached_property
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets, QtWebChannel
from pprint import pprint
import bs4 as bs
def get_webchannel_source():
file = QtCore.QFile(":/qtwebchannel/qwebchannel.js")
if not file.open(QtCore.QIODevice.ReadOnly):
return ""
content = file.readAll()
file.close()
return content.data().decode()
class Manager(QtCore.QObject):
def __init__(self, *, offline=True, visible=False, parent=None):
super().__init__(parent)
self._html = ""
self._is_finished = False
self.app
self._profile = (
QtWebEngineWidgets.QWebEngineProfile()
if offline
else QtWebEngineWidgets.QWebEngineProfile.defaultProfile()
)
self.view.resize(640, 480)
if not visible:
self.view.setAttribute(QtCore.Qt.WA_DontShowOnScreen, True)
self.view.show()
self.webchannel.registerObject("manager", self)
self.view.page().setWebChannel(self.webchannel)
#cached_property
def app(self):
return QtWidgets.QApplication(sys.argv)
#property
def profile(self):
return self._profile
#cached_property
def view(self):
view = QtWebEngineWidgets.QWebEngineView()
page = QtWebEngineWidgets.QWebEnginePage(self.profile, self)
view.setPage(page)
return view
#cached_property
def webchannel(self):
return QtWebChannel.QWebChannel(self)
#property
def html(self):
return self._html
def set_script(self, script):
qscript = QtWebEngineWidgets.QWebEngineScript()
qscript.setName("qscript")
qscript.setSourceCode(get_webchannel_source() + "\n" + script)
qscript.setInjectionPoint(QtWebEngineWidgets.QWebEngineScript.DocumentReady)
qscript.setWorldId(QtWebEngineWidgets.QWebEngineScript.MainWorld)
self.profile.scripts().insert(qscript)
def start(self, url):
self.view.load(QtCore.QUrl.fromUserInput(url))
self.app.exec_()
#QtCore.pyqtSlot()
def save_html(self):
if not self._is_finished:
self.view.page().toHtml(self.html_callable)
self._is_finished = True
def html_callable(self, html):
self._html = html
self.app.quit()
JS = """
var manager = null;
function find_element() {
var e = document.getElementById('DataTables_Table_0');
console.log("try verify", e, manager);
if (e != null && manager != null) {
console.log(e)
manager.save_html()
} else {
setTimeout(find_element, 100);
}
}
(function wait_qt() {
if (typeof qt != 'undefined') {
console.log("Qt loaded");
new QWebChannel(qt.webChannelTransport, function (channel) {
manager = channel.objects.manager;
find_element();
});
} else {
setTimeout(wait_qt, 100);
}
})();
"""
def main():
manager = Manager()
manager.set_script(JS)
manager.start(
"https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all"
)
soup = bs.BeautifulSoup(manager.html, "html.parser")
section = soup.find("table", {"id": "DataTables_Table_0"})
pprint(section)
if __name__ == "__main__":
main()

How would i modify my code to avoid this error : PyQt: RuntimeError: wrapped C/C++ object has been deleted [duplicate]

I'm using Qt's QWebPage to render a page that uses javascript to update its content dynamically - so a library that just downloads a static version of the page (such as urllib2) won't work.
My problem is, when I render a second page, about 99% of the time the program just crashes. At other times, it will work three times before crashing. I've also gotten a few segfaults, but it is all very random.
My guess is the object I'm using to render isn't getting deleted properly, so trying to reuse it is possibly causing some problems for myself. I've looked all over and no one really seems to be having this same issue.
Here's the code I'm using. The program downloads web pages from steam's community market so I can create a database of all the items. I need to call the getItemsFromPage function multiple times to get all of the items, as they are broken up into pages (showing results 1-10 out of X amount).
import csv
import re
import sys
from string import replace
from bs4 import BeautifulSoup
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Item:
__slots__ = ("name", "count", "price", "game")
def __repr__(self):
return self.name + "(" + str(self.count) + ")"
def __str__(self):
return self.name + ", " + str(self.count) + ", $" + str(self.price)
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
self.deleteLater()
def getItemsFromPage(appid, page=1):
r = Render("http://steamcommunity.com/market/search?q=appid:" + str(appid) + "#p" + str(page))
soup = BeautifulSoup(str(r.frame.toHtml().toUtf8()))
itemLst = soup.find_all("div", "market_listing_row market_recent_listing_row")
items = []
for k in itemLst:
i = Item()
i.name = k.find("span", "market_listing_item_name").string
i.count = int(replace(k.find("span", "market_listing_num_listings_qty").string, ",", ""))
i.price = float(re.search(r'\$([0-9]+\.[0-9]+)', str(k)).group(1))
i.game = appid
items.append(i)
return items
if __name__ == "__main__":
print "Updating market items to dota2.csv ..."
i = 1
with open("dota2.csv", "w") as f:
writer = csv.writer(f)
r = None
while True:
print "Page " + str(i)
items = getItemsFromPage(570)
if len(items) == 0:
print "No items found, stopping..."
break
for k in items:
writer.writerow((k.name, k.count, k.price, k.game))
i += 1
print "Done."
Calling getItemsFromPage once works fine. Subsequent calls give me my problem. The output of the program is typically
Updating market items to dota2.csv ...
Page 1
Page 2
and then it crashes. It should go on for over 700 pages.

The problem with your program is that you are attempting to create a new QApplication with every url you fetch.
Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.
Here are some examples which show how to use the WebPage class:
Usage:
def my_html_processor(html, url):
print('loaded: [%d chars] %s' % (len(html), url))
import sys
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)
# example 1: process list of urls
urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)
# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))
sys.exit(app.exec_())
PyQt5 WebPage:
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebPage(QWebEnginePage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super().__init__()
self._verbose = verbose
self.loadFinished.connect(self.handleLoadFinished)
def process(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.load(QUrl(url))
return True
def processCurrentPage(self, html):
self.htmlReady.emit(html, self.url().toString())
if not self.fetchNext():
QApplication.instance().quit()
def handleLoadFinished(self):
self.toHtml(self.processCurrentPage)
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super().javaScriptConsoleMessage(*args, **kwargs)
PyQt4 WebPage:
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
class WebPage(QWebPage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super(WebPage, self).__init__()
self._verbose = verbose
self.mainFrame().loadFinished.connect(self.handleLoadFinished)
def start(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.mainFrame().load(QUrl(url))
return True
def processCurrentPage(self):
self.htmlReady.emit(
self.mainFrame().toHtml(), self.mainFrame().url().toString())
print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))
def handleLoadFinished(self):
self.processCurrentPage()
if not self.fetchNext():
QApplication.instance().quit()
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)

Python Dynamic web scraping using PyQT4: got stuck at exec() [duplicate]

I'm using Qt's QWebPage to render a page that uses javascript to update its content dynamically - so a library that just downloads a static version of the page (such as urllib2) won't work.
My problem is, when I render a second page, about 99% of the time the program just crashes. At other times, it will work three times before crashing. I've also gotten a few segfaults, but it is all very random.
My guess is the object I'm using to render isn't getting deleted properly, so trying to reuse it is possibly causing some problems for myself. I've looked all over and no one really seems to be having this same issue.
Here's the code I'm using. The program downloads web pages from steam's community market so I can create a database of all the items. I need to call the getItemsFromPage function multiple times to get all of the items, as they are broken up into pages (showing results 1-10 out of X amount).
import csv
import re
import sys
from string import replace
from bs4 import BeautifulSoup
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Item:
__slots__ = ("name", "count", "price", "game")
def __repr__(self):
return self.name + "(" + str(self.count) + ")"
def __str__(self):
return self.name + ", " + str(self.count) + ", $" + str(self.price)
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
self.deleteLater()
def getItemsFromPage(appid, page=1):
r = Render("http://steamcommunity.com/market/search?q=appid:" + str(appid) + "#p" + str(page))
soup = BeautifulSoup(str(r.frame.toHtml().toUtf8()))
itemLst = soup.find_all("div", "market_listing_row market_recent_listing_row")
items = []
for k in itemLst:
i = Item()
i.name = k.find("span", "market_listing_item_name").string
i.count = int(replace(k.find("span", "market_listing_num_listings_qty").string, ",", ""))
i.price = float(re.search(r'\$([0-9]+\.[0-9]+)', str(k)).group(1))
i.game = appid
items.append(i)
return items
if __name__ == "__main__":
print "Updating market items to dota2.csv ..."
i = 1
with open("dota2.csv", "w") as f:
writer = csv.writer(f)
r = None
while True:
print "Page " + str(i)
items = getItemsFromPage(570)
if len(items) == 0:
print "No items found, stopping..."
break
for k in items:
writer.writerow((k.name, k.count, k.price, k.game))
i += 1
print "Done."
Calling getItemsFromPage once works fine. Subsequent calls give me my problem. The output of the program is typically
Updating market items to dota2.csv ...
Page 1
Page 2
and then it crashes. It should go on for over 700 pages.

The problem with your program is that you are attempting to create a new QApplication with every url you fetch.
Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.
Here are some examples which show how to use the WebPage class:
Usage:
def my_html_processor(html, url):
print('loaded: [%d chars] %s' % (len(html), url))
import sys
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)
# example 1: process list of urls
urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)
# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))
sys.exit(app.exec_())
PyQt5 WebPage:
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebPage(QWebEnginePage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super().__init__()
self._verbose = verbose
self.loadFinished.connect(self.handleLoadFinished)
def process(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.load(QUrl(url))
return True
def processCurrentPage(self, html):
self.htmlReady.emit(html, self.url().toString())
if not self.fetchNext():
QApplication.instance().quit()
def handleLoadFinished(self):
self.toHtml(self.processCurrentPage)
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super().javaScriptConsoleMessage(*args, **kwargs)
PyQt4 WebPage:
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
class WebPage(QWebPage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super(WebPage, self).__init__()
self._verbose = verbose
self.mainFrame().loadFinished.connect(self.handleLoadFinished)
def start(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.mainFrame().load(QUrl(url))
return True
def processCurrentPage(self):
self.htmlReady.emit(
self.mainFrame().toHtml(), self.mainFrame().url().toString())
print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))
def handleLoadFinished(self):
self.processCurrentPage()
if not self.fetchNext():
QApplication.instance().quit()
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrape multiple urls using QWebPage - python

Related

Printing several PDFs using PyQt5 [duplicate]

How to make PyQt5 wait n seconds before returning page

PyQt5 returns None value when loading Web page content

How would i modify my code to avoid this error : PyQt: RuntimeError: wrapped C/C++ object has been deleted [duplicate]

Python Dynamic web scraping using PyQT4: got stuck at exec() [duplicate]

Categories

Resources