I wrote a program to extract image links from a webcomic site, but when I run it, it only extracts the image links from the last chapter, not from all chapters. What is wrong with my program?
I have tried several approaches, but nothing has worked.
from PyQt5 import QtNetwork, QtCore
from requests_html import HTML
from functools import cached_property
from PyQt5.QtCore import QCoreApplication, QUrl

url1 = "https://saytruyen.net/truyen-su-tro-lai-cua-phap-su-hac-am-sau-66666-nam.html"

class Manager:

    def __init__(self):
        self.manager.finished.connect(self.handle_response)

    @cached_property
    def manager(self):
        return QtNetwork.QNetworkAccessManager()

    def start(self):
        self.start_request(QtCore.QUrl(url1))

    def start_request(self, url):
        request = QtNetwork.QNetworkRequest(url)
        self.manager.get(request)

    def handle_response(self, reply):
        err = reply.error()
        if err == QtNetwork.QNetworkReply.NoError:
            self.process(str(reply.readAll(), 'utf-8'))
        else:
            print("Error occurred: ", err)
            print(reply.errorString())

    def process(self, data):
        html = HTML(html=data)
        rs = html.find("#list-chapter a", first=False)
        for i in reversed(rs):
            url2 = "https://saytruyen.net/" + i.attrs["href"]
            #print(url2)
            #self.start_request(QtCore.QUrl(url2))
            req = QtNetwork.QNetworkRequest(QUrl(url2))
            self.nam = QtNetwork.QNetworkAccessManager()
            self.nam.finished.connect(self.handleResponse)
            self.nam.get(req)

    def handleResponse(self, reply):
        er = reply.error()
        if er == QtNetwork.QNetworkReply.NoError:
            bytes_string = reply.readAll()
            html2 = HTML(html=str(bytes_string, 'utf-8'))
            rs_c = html2.find("#lst_content img")
            for x in rs_c:
                img = "https://saytruyen.net/" + x.attrs['src']
                print(img)
        else:
            print("Error occurred: ", er)
            print(reply.errorString())
        QCoreApplication.quit()
There are two problems:
the QNetworkAccessManager used for the downloads is recreated on every iteration; since a network request is asynchronous, it isn't processed instantly, and each pending request is destroyed along with its manager when self.nam is overwritten in the next cycle of the for loop, so only the last request "survives";
the application quits as soon as the first reply is received, which prevents all the other requests from being processed.
The solution is to create a single manager for the download process in __init__, and quit only once all requests have been received.
class Manager:

    def __init__(self):
        self.manager.finished.connect(self.handle_response)
        self.nam = QtNetwork.QNetworkAccessManager()
        self.nam.finished.connect(self.handleResponse)
        self.urls = set()

    # ...

    def process(self, data):
        html = HTML(html=data)
        rs = html.find("#list-chapter a", first=False)
        for i in reversed(rs):
            url2 = QUrl("https://saytruyen.net/" + i.attrs["href"])
            if url2 in self.urls:
                continue
            self.urls.add(url2)
            req = QtNetwork.QNetworkRequest(url2)
            self.nam.get(req)

    def handleResponse(self, reply):
        self.urls.discard(reply.url())
        er = reply.error()
        if er == QtNetwork.QNetworkReply.NoError:
            bytes_string = reply.readAll()
            html2 = HTML(html=str(bytes_string, 'utf-8'))
            rs_c = html2.find("#lst_content img")
            for x in rs_c:
                img = "https://saytruyen.net/" + x.attrs['src']
                print(img)
        else:
            print("Error occurred: ", er)
            print(reply.errorString())
        if not self.urls:
            QCoreApplication.quit()
Note that it's usually enough (and better) to use a single network manager and properly handle responses based on the queued requests, but for simple situations like this one, having two managers isn't a big problem.
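For illustration, here is a minimal sketch of that single-manager approach (my own example, not part of the original answer): each request is tagged with a custom attribute saying what it is for, and one finished handler dispatches on that tag. The "chapter-list"/"chapter" tag values are hypothetical.

from PyQt5 import QtNetwork
from PyQt5.QtCore import QUrl

KIND = QtNetwork.QNetworkRequest.User  # slot for a custom request attribute

class SingleManager:

    def __init__(self):
        self.nam = QtNetwork.QNetworkAccessManager()
        self.nam.finished.connect(self.dispatch)

    def fetch(self, url, kind):
        req = QtNetwork.QNetworkRequest(QUrl(url))
        req.setAttribute(KIND, kind)  # remember what this request is for
        self.nam.get(req)

    def dispatch(self, reply):
        kind = reply.request().attribute(KIND)
        if kind == "chapter-list":
            pass  # parse the chapter links, then self.fetch(link, "chapter")
        elif kind == "chapter":
            pass  # parse and print the image links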
Related
I'm writing a simple web scraper that downloads the item images for a few champions from a site. I loop over the champions with a "for", but it only processes two of them and then closes without raising any error!
import bs4 as bs
import sys, os
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl

class Page(QWebEnginePage):

    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        print("#1 __init__")
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        self.html = self.toHtml(self.Callable)
        print('#2 On Load finished')

    def Callable(self, html_str):
        print("#3 Callable\n")
        self.html = html_str
        self.app.quit()

def already_exist(image_name):
    for _, _, folder in os.walk('Images'):
        if image_name in folder:
            return False
        else:
            return True

def ImageDownload(url):
    image_name = url.split("/")
    try:
        if already_exist(image_name[-1]):
            full_path = "Images/" + image_name[-1]
            urllib.request.urlretrieve(url, full_path)
            print("Download %s" % image_name)
        else:
            print("Image already Downloaded >: %s" % image_name[-1])
    except:
        print("Error Download")

def main():
    champions = ['Amumu','Akali','Zed','Nunu'] #champions
    for champ in champions:
        try:
            print("\nDownloading Images >: %s" % champ)
            data = Page('https://www.probuilds.net/champions/details/%s' % champ.strip())
            soup = bs.BeautifulSoup(data.html, 'html.parser')
            items = soup.find_all('div', {'class':'items'})
            for photos in items:
                images = photos.find_all('img')
                for image in images:
                    ImageDownload(image['src'])
        except:
            print("Shi...")

main()
I'm getting no error, but the program only executes twice; that's the problem. Can someone help me?
It seems that the QWebEnginePage does not close correctly; it is also advisable to reuse a single QWebEnginePage instead of creating a new one for each URL. So, using an old answer as a basis, I have implemented the following solution:
import os
import sys
import bs4 as bs
import urllib.request
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

class WebPage(QtWebEngineWidgets.QWebEnginePage):

    def __init__(self):
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self, html):
        self.process(self.url(), html)
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def process(self, url, html):
        print('loaded: [%d chars] %s' % (len(html), url.toString()))

class ScrapePage(WebPage):

    def __init__(self):
        super(ScrapePage, self).__init__()
        self.results = set()

    def process(self, url, html):
        soup = bs.BeautifulSoup(html, 'html.parser')
        items = soup.find_all('div', {'class':'items'})
        for photos in items:
            images = photos.find_all('img')
            for image in images:
                self.results.add(image['src'])

def already_exist(image_name):
    for _, _, folder in os.walk('Images'):
        if image_name in folder:
            return False
        else:
            return True

def ImageDownload(url):
    image_name = url.split("/")
    try:
        if already_exist(image_name[-1]):
            full_path = "Images/" + image_name[-1]
            urllib.request.urlretrieve(url, full_path)
            print("Download %s" % image_name)
        else:
            print("Image already Downloaded >: %s" % image_name[-1])
    except:
        print("Error Download")

if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    webpage = ScrapePage()
    champions = ['Amumu','Akali','Zed','Nunu']
    base_url = 'https://www.probuilds.net/champions/details/'
    urls = []
    for champ in champions:
        url = QtCore.QUrl(base_url).resolved(QtCore.QUrl(champ))
        urls.append(url)
    webpage.start(urls)
    app.exec_()
    for url in webpage.results:
        ImageDownload(url)
I understand that this is a duplicate, but I haven't had that "ah-ha" moment where I understand HOW to access a class's variable. In this code, I am crawling a website from a list of thousands of pages. Those jobs are submitted via concurrent.futures.
I want to be able to return the value of results. I've set self.results within def __init__(self, url_list, threads), but I can't seem to pull that variable when I try print(example.results).
If self.results is returning a value, but example.results isn't pulling it from if __name__ == '__main__':, how can you access it? I know I've done something wrong, but I don't know what it is.
from concurrent.futures import ThreadPoolExecutor
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *

site = 0

class ConcurrentListCrawler(object):

    def __init__(self, url_list, threads):
        self.urls = url_list
        self.results = {}
        self.max_threads = threads

    def __make_request(self, url):
        try:
            r = requests.get(url=url, timeout=20)
            r.raise_for_status()
            print(countit(), r.url)
        except requests.exceptions.Timeout:
            r = requests.get(url=url, timeout=60)
        except requests.exceptions.ConnectionError:
            r = requests.get(url=url, timeout=60)
        except requests.exceptions.RequestException as e:
            raise e
        return r.url, r.text

    def __parse_results(self, url, html):
        try:
            print(url)
            trip_data = restaurant_parse(url)
        except Exception as e:
            raise e
        if trip_data:
            print('here we go')
            self.results = trip_data
            #print(self.results)
        return self.results

    def wrapper(self, url):
        url, html = self.__make_request(url)
        self.__parse_results(url, html)

    def run_script(self):
        with ThreadPoolExecutor(max_workers=min(len(self.urls), self.max_threads)) as Executor:
            jobs = [Executor.submit(self.wrapper, u) for u in self.urls]

if __name__ == '__main__':
    listo = loadit()
    print(listo)
    print(len(listo))
    example = ConcurrentListCrawler(listo, 10)
    example.run_script()
    print(example.results)
Any pointers would be greatly appreciated.
I believe one of your methods is not returning the results.
Make the following change.
def wrapper(self, url):
    url, html = self.__make_request(url)
    return self.__parse_results(url, html)
After this, I suggest you use self.results as a dictionary, as it was declared. In __parse_results(), add trip_data to self.results keyed by url instead of reassigning the whole attribute:
def __parse_results(self, url, html):
    try:
        print(url)
        trip_data = restaurant_parse(url)
    except Exception as e:
        raise e
    if trip_data:
        print('here we go')
        self.results[url] = trip_data
        #print(self.results)
    return self.results
Because entries are added to self.results rather than the attribute being reassigned, the older values are retained instead of being replaced on every call.
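With that change, the results can be read back once run_script() returns, since the with block inside run_script waits for all submitted jobs to finish. For example:

example = ConcurrentListCrawler(listo, 10)
example.run_script()  # returns only after every future has completed
for url, data in example.results.items():
    print(url, data)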
The issue was that I submitted all the jobs at once through a list. I was unable to pull the variable from the class with print(example.results) because that part of the code isn't reached until all jobs are complete. I was able to resolve it by getting rid of the class (even though the title of this posting suggests the class is the issue).
from concurrent.futures import ThreadPoolExecutor
import concurrent
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *

site = 0

def load_url(url):
    try:
        print(countit(), url)
        trip_data = restaurant_parse(url)
        return trip_data
    except Exception as e:
        raise e

if __name__ == '__main__':
    URLs = loadit()
    #print(URLs)
    #print(len(URLs))
    with ThreadPoolExecutor(max_workers=10) as executor:
        # start the load operations and mark each future with its URL
        future_to_url = {executor.submit(load_url, url): url for url in URLs}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                print('this is data', data)
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
Here, I can pull the dictionary by grabbing data.
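For instance (my own addition), the per-URL results could still be collected into a single dictionary inside the as_completed loop:

results = {}
for future in concurrent.futures.as_completed(future_to_url):
    url = future_to_url[future]
    results[url] = future.result()  # gather each page's parsed data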
Thanks for the help, everyone.
I have this code that receives all of the networking resources of a web page.
I took this code from this site, so I don't know exactly how it works, but I know that it captures all of the networking resources of a web page, which is what I need.
This is my code:
import sys, time
from PySide.QtCore import QUrl, SIGNAL
from PySide.QtGui import QApplication
from PySide.QtWebKit import QWebPage, QWebView, QWebSettings
from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest

#reload(sys)
#sys.setdefaultencoding('utf-8')

fn_log = 'url_dd.txt'
fp_log = open(fn_log, 'ab+')

class WebPage(QWebPage):

    def __init__(self, logger=None, parent=None):
        super(WebPage, self).__init__(parent)

    def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
        sys.stderr.write('JavaScript error at line number %d\n' % (lineNumber))
        sys.stderr.write('%s\n' % (message, ))
        sys.stderr.write('Source ID: %s\n' % (sourceID, ))

class Crawler(QApplication):

    def __init__(self, url):
        super(Crawler, self).__init__(sys.argv)
        self.url = url
        self.web_view = QWebView()
        self.web_page = WebPage()
        self.web_view.setPage(self.web_page)
        self.web_frame = self.web_page.mainFrame()
        self.network = NetworkAccessManager()
        self.web_page.setNetworkAccessManager(self.network)
        self.settings = self.web_page.settings().globalSettings()
        self.settings.setAttribute(QWebSettings.PluginsEnabled, False)
        QWebSettings.clearMemoryCaches()
        self.web_view.resize(1024, 9000)
        self.connect(self.web_page, SIGNAL('loadFinished(bool)'), self.loadFinished)
        print('Before loading')
        self.web_view.load(QUrl(self.url))
        print('After loading')

    def loadFinished(self, ok):
        print('Start loadFinished()')
        print('Start writing')
        #with open('content_dd.txt', 'ab+') as fp:
        #    fp.write(self.web_frame.toHtml().toUtf8())
        print('End writing')
        print('End loadFinished()')
        try:
            self.quit()
        except Exception as e:
            print('FATAL ERROR: %s' % (str(e)))

class NetworkAccessManager(QNetworkAccessManager):

    def __init__(self):
        super(NetworkAccessManager, self).__init__()
        # QNetworkAccessManager.__init__(self)
        self.connect(self, SIGNAL('finished (QNetworkReply *)'), self.finishd)

    def createRequest(self, operation, request, data):
        # url = request.url().toString()
        self.setNetworkAccessible(self.Accessible)
        return QNetworkAccessManager.createRequest(self, operation, request, data)

    def finishd(self, reply):
        print('In NetworkAccessManager finishd')
        url = str(reply.url().toString())
        log = '%s: %s\n' % (time.ctime(), url)
        #fp_log.write(log)
        print(reply)
        print(reply.request())
        print(log)
        print(url)

if __name__ == '__main__':
    url = 'http://need4bit.com'
    crawler = Crawler(url)
    sys.exit(crawler.exec_())
How should I modify this code so that it saves all of the resources into a directory?
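One possible direction, as a sketch (my own illustration, not a full answer; the save_resource helper is hypothetical and assumes Python 3's urllib): collect each resource URL in finishd and re-download it into a local directory. Re-downloading sidesteps the fact that the page itself consumes the reply stream, so calling reply.readAll() inside finishd would usually return nothing.

import os
import urllib.request
from urllib.parse import urlparse

def save_resource(url, out_dir='resources'):
    # Derive a file name from the URL path; fall back to a default name
    name = os.path.basename(urlparse(url).path) or 'index.html'
    os.makedirs(out_dir, exist_ok=True)
    # Note: resources with the same base name will overwrite each other
    urllib.request.urlretrieve(url, os.path.join(out_dir, name))

Calling save_resource(url) at the end of finishd would then save every resource the page requests.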
I'm trying to write a small python app, using PySide for the GUI and Twython as a Twitter API library, to catch a stream from Twitter.
The problem that I am having is that when I click the "Start Monitoring Twitter" button, the UI freezes until the stream is complete, at which point the code continues to execute, disables the Start button, and enables the Stop button.
Everything else seems to work: if I leave it running, the CSV file is created as I expect, so the Twython components seem to be working.
Line 151 is where the streaming from Twitter is engaged when I click start:
self.stream.statuses.filter(track=self.search_term)
How can I move the streaming to a separate thread and then use the Stop button in the UI to tell Twython to stop capturing the stream and exit?
I need to be able to move the MyStreamer instance to another thread and then call its .disconnect() method to have it stop capturing the stream.
Here's the full code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import platform
import PySide
from PySide.QtGui import QApplication, QMainWindow, QPushButton, QCheckBox, QTextEdit
from time import sleep
from ui_tweetstream import Ui_MainWindow
from twython import Twython
from twython import TwythonStreamer
import csv

class MainWindow(QMainWindow, Ui_MainWindow):

    def __init__(self, parent=None):
        super(MainWindow, self).__init__(parent)
        self.setupUi(self)

        # Set up Variables
        self.tweet_fav_count = True
        self.tweet_geocoordinates = True
        self.tweet_id = True
        self.tweet_language = True
        self.tweet_orig_tweet_id = True
        self.tweet_orig_username = True
        self.tweet_retweeted = True
        self.tweet_sensitive = True
        self.tweet_source_app = True
        self.tweet_timestamp = True
        self.tweet_user_name = True
        self.search_term = "#bigdata"
        self.tweets_to_get = 1000

        # Bind the interface
        self.check_tweet_fav_count.clicked.connect(self.setTweetFavCount)
        self.check_tweet_geocoordinates.clicked.connect(self.setTweetGeocoordinates)
        self.check_tweet_id.clicked.connect(self.setTweetID)
        self.check_tweet_language.clicked.connect(self.setTweetLanguage)
        self.check_tweet_orig_tweet_id.clicked.connect(self.setTweetOrigTweetID)
        self.check_tweet_orig_username.clicked.connect(self.setTweetOrigUsername)
        self.check_tweet_retweeted.clicked.connect(self.setTweetRetweeted)
        self.check_tweet_sensitive.clicked.connect(self.setTweetSensitive)
        self.check_tweet_source_app.clicked.connect(self.setTweetSourceApp)
        self.check_tweet_timestamp.clicked.connect(self.setTweetTimestamp)
        self.check_tweet_user_name.clicked.connect(self.setTweetUsername)
        self.button_start.clicked.connect(self.streamStart)
        self.button_stop.clicked.connect(self.streamStop)

        # Set the initial states
        self.button_stop.setEnabled(False)

        APP_KEY = ''
        APP_SECRET = ''
        OAUTH_TOKEN = ''
        OAUTH_TOKEN_SECRET = ''
        self.t = Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
        self.stream = MyStreamer(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
        self.stream.init_mainWindow(self)

    def streamStop(self):
        print "Stopping stream"
        # Enable other controls here
        self.button_stop.setEnabled(False)
        self.button_start.setEnabled(True)
        self.setControlStates(True)
        self.stream.stopStream()

    def setControlStates(self, state):
        self.check_tweet_fav_count.setEnabled(state)
        self.check_tweet_geocoordinates.setEnabled(state)
        self.check_tweet_id.setEnabled(state)
        self.check_tweet_language.setEnabled(state)
        self.check_tweet_orig_tweet_id.setEnabled(state)
        self.check_tweet_orig_username.setEnabled(state)
        self.check_tweet_retweeted.setEnabled(state)
        self.check_tweet_sensitive.setEnabled(state)
        self.check_tweet_source_app.setEnabled(state)
        self.check_tweet_timestamp.setEnabled(state)
        self.check_tweet_user_name.setEnabled(state)
        self.search_box.setEnabled(state)
        self.num_tweets_box.setEnabled(state)

    # Functions for determining what to track
    def setTweetFavCount(self):
        self.tweet_fav_count = not self.tweet_fav_count
        print "tweet_fav_count:", self.tweet_fav_count

    def setTweetGeocoordinates(self):
        self.tweet_geocoordinates = not self.tweet_geocoordinates
        print "tweet_geocoordinates:", self.tweet_geocoordinates

    def setTweetID(self):
        self.tweet_id = not self.tweet_id
        print "tweet_id:", self.tweet_id

    def setTweetLanguage(self):
        self.tweet_language = not self.tweet_language
        print "tweet_language:", self.tweet_language

    def setTweetOrigTweetID(self):
        self.tweet_orig_tweet_id = not self.tweet_orig_tweet_id
        print "tweet_orig_tweet_id:", self.tweet_orig_tweet_id

    def setTweetOrigUsername(self):
        self.tweet_orig_username = not self.tweet_orig_username
        print "tweet_orig_username:", self.tweet_orig_username

    def setTweetRetweeted(self):
        self.tweet_retweeted = not self.tweet_retweeted
        print "tweet_retweeted:", self.tweet_retweeted

    def setTweetSensitive(self):
        self.tweet_sensitive = not self.tweet_sensitive
        print "tweet_sensitive:", self.tweet_sensitive

    def setTweetSourceApp(self):
        self.tweet_source_app = not self.tweet_source_app
        print "tweet_source_app:", self.tweet_source_app

    def setTweetTimestamp(self):
        self.tweet_timestamp = not self.tweet_timestamp
        print "tweet_timestamp:", self.tweet_timestamp

    def setTweetUsername(self):
        self.tweet_user_name = not self.tweet_user_name
        print "tweet_user_name:", self.tweet_user_name

    # Functions for starting and stopping the stream
    def streamStart(self):
        print "Starting stream"
        self.setControlStates(False)
        # Disable other controls here
        self.button_start.setEnabled(False)
        self.button_stop.setEnabled(True)
        # Hack to try to disable the UI
        # sleep(0.25)
        # Get the active search term
        self.search_term = self.search_box.text()
        # Get the number of tweets
        self.tweets_to_get = int(self.num_tweets_box.text())
        # Set the streamer
        self.stream.set_start_criteria(self.tweets_to_get)
        self.stream.statuses.filter(track=self.search_term)

class MyStreamer(TwythonStreamer):

    def init_mainWindow(self, the_main_window):
        self.main_window = the_main_window
        self.stop = False
        self.header_done = False

    def set_start_criteria(self, numTweets):
        self.maxTweets = numTweets
        self.tweetCount = 0
        print "Number of tweets to get:", self.maxTweets

    def stopStream(self):
        self.stop = True

    def on_success(self, data):
        if 'text' in data:
            self.tweetCount += 1
            print "tweetCount:", self.tweetCount
            #tweet = data['text'].encode('utf-8')
            theTweet = data
            writer = TweetMonkey()
            writer.assignMainWindow(self.main_window, self.header_done)
            self.header_done = True
            writer.process(theTweet)
        # Want to disconnect after the first result?
        if self.stop is True or self.tweetCount >= self.maxTweets:
            self.disconnect()

    def on_error(self, status_code, data):
        print status_code, data

class TweetMonkey:

    def assignMainWindow(self, the_main_window, is_header_done):
        self.main_window = the_main_window
        self.header_done = is_header_done

    def clean(self, text):
        text = text.replace("\n", "; ")
        text = text.replace('"', "'")
        text = text.replace(',', " ")
        return text

    def create_header(self):
        header = []
        tweets = open("tweets.csv", 'ab+')
        wr = csv.writer(tweets, dialect='excel')
        if self.main_window.tweet_id is True:
            header.append("id")
        if self.main_window.tweet_language is True:
            header.append("lang")
        if self.main_window.tweet_user_name is True:
            header.append("user_name")
        header.append("tweet")
        if self.main_window.tweet_retweeted is True:
            header.append("retweeted")
        if self.main_window.tweet_fav_count is True:
            header.append("favorite_count")
        if self.main_window.tweet_source_app is True:
            header.append("source")
        if self.main_window.tweet_orig_tweet_id is True:
            header.append("in_reply_to_status_id")
        if self.main_window.tweet_orig_username is True:
            header.append("in_reply_to_screen_name")
            # header.append("in_reply_to_user_id")
        if self.main_window.tweet_sensitive is True:
            header.append("possibly_sensitive")
        if self.main_window.tweet_geocoordinates is True:
            header.append("geo")
        if self.main_window.tweet_timestamp is True:
            header.append("created_at")
        wr.writerow(header)
        tweets.close()

    def process(self, tweet):
        if not self.header_done:
            self.create_header()
            self.header_done = True
        # Create the file or append to the existing
        theOutput = []
        tweets = open("tweets.csv", 'ab+')
        wr = csv.writer(tweets, dialect='excel')
        if self.main_window.tweet_id is True:
            theOutput.append(tweet['id'])
        if self.main_window.tweet_language is True:
            theOutput.append(tweet['lang'].encode('utf-8'))
        if self.main_window.tweet_user_name is True:
            theOutput.append(tweet['user']['name'].encode('utf-8', 'replace'))
        theOutput.append(self.clean(tweet['text']).encode('utf-8', 'replace'))
        if self.main_window.tweet_retweeted is True:
            theOutput.append(tweet['retweeted'])
        if self.main_window.tweet_fav_count is True:
            theOutput.append(tweet['favorite_count'])
        if self.main_window.tweet_source_app is True:
            theOutput.append(self.clean(tweet['source']).encode('utf-8', 'replace'))
        if self.main_window.tweet_orig_tweet_id is True:
            theOutput.append(tweet['in_reply_to_status_id'])
        if self.main_window.tweet_orig_username is True:
            theOutput.append(tweet['in_reply_to_screen_name'])
            #theOutput.append(tweet['in_reply_to_user_id'])
        if self.main_window.tweet_sensitive is True:
            if tweet.get('possibly_sensitive'):
                theOutput.append(tweet['possibly_sensitive'])
            else:
                theOutput.append("False")
        if self.main_window.tweet_geocoordinates is True:
            if tweet['geo'] is not None:
                if tweet['geo']['type'] == 'Point':
                    lat = str(tweet['geo']['coordinates'][0]) + " "
                    lon = str(tweet['geo']['coordinates'][1])
                    theOutput.append(lat + lon)
                else:
                    theOutput.append(tweet['geo'])
            else:
                theOutput.append(tweet['geo'])
        if self.main_window.tweet_timestamp is True:
            theOutput.append(tweet['created_at'])
        wr.writerow(theOutput)
        tweets.close()

if __name__ == '__main__':
    app = QApplication(sys.argv)
    frame = MainWindow()
    frame.show()
    app.exec_()
I know this is an old post, but I ran into a similar problem in a simple app I recently wrote, and my solution was to use threading.
I used the worker pattern from:
https://pymotw.com/2/threading/
and the method described in:
http://aadrake.com/using-twitter-as-a-stream-processing-source.html
Basically, I run the Twython stream in a separate thread that feeds the tweet text into a queue, and the rest of the program runs in a separate loop that reads from the queue.
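A minimal sketch of that pattern (my own illustration; it assumes a TwythonStreamer subclass like the MyStreamer in the question, the same placeholder credentials, and Python 2's Queue module):

import threading
from Queue import Queue  # queue.Queue on Python 3
from twython import TwythonStreamer

tweet_queue = Queue()

class QueueStreamer(TwythonStreamer):
    def on_success(self, data):
        if 'text' in data:
            tweet_queue.put(data['text'])  # hand the tweet to the main loop

stream = QueueStreamer(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
worker = threading.Thread(target=stream.statuses.filter,
                          kwargs={'track': '#bigdata'})
worker.daemon = True  # let the program exit even if the stream is still open
worker.start()

# The GUI (or any main loop) stays responsive and consumes from the queue;
# calling stream.disconnect() from the main thread ends the stream.
while True:
    print tweet_queue.get()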
I'm writing a Python web crawler and I want to make it multi-threaded. I have finished the basic part; below is what it does:
a thread gets a url from the queue;
the thread extracts the links from the page, checks if the links exist in a pool (a set), and puts the new links to the queue and the pool;
the thread writes the url and the http response to a csv file.
But when I run the crawler, it always gets stuck eventually and never exits properly. I have gone through the official Python documentation but still have no clue.
Below is the code:
#!/usr/bin/env python
#!coding=utf-8

import requests, re, urlparse
import threading
from Queue import Queue
from bs4 import BeautifulSoup

#custom modules and files
from setting import config

class Page:

    def __init__(self, url):
        self.url = url
        self.status = ""
        self.rawdata = ""
        self.error = False

        r = ""
        try:
            r = requests.get(self.url, headers={'User-Agent': 'random spider'})
        except requests.exceptions.RequestException as e:
            self.status = e
            self.error = True
        else:
            if not r.history:
                self.status = r.status_code
            else:
                self.status = r.history[0]
        self.rawdata = r

    def outlinks(self):
        self.outlinks = []
        #links, contains URL, anchor text, nofollow
        raw = self.rawdata.text.lower()
        soup = BeautifulSoup(raw)
        outlinks = soup.find_all('a', href=True)
        for link in outlinks:
            d = {"follow": "yes"}
            d['url'] = urlparse.urljoin(self.url, link.get('href'))
            d['anchortext'] = link.text
            if link.get('rel'):
                if "nofollow" in link.get('rel'):
                    d["follow"] = "no"
            if d not in self.outlinks:
                self.outlinks.append(d)

pool = Queue()
exist = set()
thread_num = 10
lock = threading.Lock()
output = open("final.csv", "a")

#the domain is the start point
domain = config["domain"]
pool.put(domain)
exist.add(domain)

def crawl():
    while True:
        p = Page(pool.get())

        #write data to output file
        lock.acquire()
        output.write(p.url + " " + str(p.status) + "\n")
        print "%s crawls %s" % (threading.currentThread().getName(), p.url)
        lock.release()

        if not p.error:
            p.outlinks()
            outlinks = p.outlinks
            if urlparse.urlparse(p.url)[1] == urlparse.urlparse(domain)[1]:
                for link in outlinks:
                    if link['url'] not in exist:
                        lock.acquire()
                        pool.put(link['url'])
                        exist.add(link['url'])
                        lock.release()
        pool.task_done()

for i in range(thread_num):
    t = threading.Thread(target=crawl)
    t.start()

pool.join()
output.close()
Any help would be appreciated!
Thanks
Marcus
Your crawl function has an infinite while loop with no possible exit path: the condition True always evaluates to True, so the loop keeps running and the program, as you say, is "not exiting properly".
Modify the crawl function's while loop to include an exit condition. For instance, exit the loop once the number of links saved to the csv file exceeds a certain minimum number, i.e.:
def crawl():
    while len(exist) <= min_links:
        ...
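Here min_links is a threshold you define yourself at module level, for example:

min_links = 500  # stop crawling once this many unique URLs have been seen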