Efficiently requesting and processing multiple HTML files with Python - python

I am writing a tool which fetches multiple HTML files and processes them as text:

for url in url_list:
    url_response = requests.get(url)
    text = url_response.text
    # Process text here (put in database, search, etc.)

The problem is that this is pretty slow. If I just needed a simple response I could have used grequests, but since I need to get the content of the HTML file, that doesn't seem to be an option. How can I speed this up?
Thanks in advance!

import requests
from multiprocessing import Pool

def process_html(url):
    url_response = requests.get(url)
    text = url_response.text
    print(text[:500])
    print('-' * 30)

urls = [
    'http://www.apple.com',
    'http://www.yahoo.com',
    'http://www.google.com',
    'http://www.apple.com',
    'http://www.yahoo.com',
    'http://www.google.com',
    'http://www.apple.com',
    'http://www.yahoo.com',
    'http://www.google.com',
]

with Pool(None) as p:  # None => uses os.cpu_count()
    p.map(process_html, urls)  # This blocks until all return values from process_html() have been collected.
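Since the work here is mostly I/O-bound (waiting on the network), a thread pool is often enough and avoids the overhead of starting separate processes. Below is a minimal sketch, not from the original answer, reusing the same process_html and urls defined above; the pool size of 20 is an arbitrary choice:

import concurrent.futures

# map() submits every URL to the pool; leaving the with-block waits
# (shutdown(wait=True)) until all process_html() calls have finished.
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    executor.map(process_html, urls)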

Use a thread for each request:

import threading
import requests

url_list = ["url1", "url2"]

def fetch_url(url):
    url_response = requests.get(url)
    text = url_response.text

threads = [threading.Thread(target=fetch_url, args=(url,)) for url in url_list]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

You need to use threading and run each requests.get(...) call in a different thread, i.e. fetch the URLs in parallel.
See these two answers on SO for examples and usage:
Python - very simple multithreading parallel URL fetching (without queue)
Multiple requests using urllib2.urlopen() at the same time


Related

How to remove a URL monitoring while script is running?

I have written a script that monitors some webpages and prints a notification whenever a specific HTML tag is found. The point is to run the script 24/7, and while the script is running I want to be able to remove URLs. I currently have a database from which I am going to read the URLs that are being added/removed.
import threading
import requests
from bs4 import BeautifulSoup

# Replacement for database for now
URLS = [
    'https://github.com/search?q=hello+world',
    'https://github.com/search?q=python+3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i+love+python',
]

def doRequest(url):
    while True:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            if soup.find("span", {"data-search-type": "Repositories"}).text.strip():  # if there are sizes
                sendNotifications({
                    'title': soup.find("input", {"name": "q"})['value'],
                    'repo_count': soup.find("span", {"data-search-type": "Repositories"}).text.strip()
                })
        else:
            print(url, response.status_code)

def sendNotifications(data):
    ...

if __name__ == '__main__':
    # TODO read URLS from database instead of lists
    for url in URLS:
        threading.Thread(target=doRequest, args=(url,)).start()
The problem I'm currently facing is that doRequest runs in a while loop all the time, and I wonder how I can remove a specific URL (e.g. https://github.com/search?q=world) while the script is running.
Method 1: A simple approach
What you want is to insert some termination logic in the while True loop so that it constantly checks for a termination signal.
To this end, you can use threading.Event().
For example, you can add a stopping_event argument:
def doRequest(url, stopping_event):
    while not stopping_event.is_set():
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            if soup.find("span", {"data-search-type": "Repositories"}).text.strip():  # if there are sizes
                sendNotifications({
                    'title': soup.find("input", {"name": "q"})['value'],
                    'repo_count': soup.find("span", {"data-search-type": "Repositories"}).text.strip()
                })
        else:
            print(url, response.status_code)
And you create these events when starting the threads:

if __name__ == '__main__':
    # TODO read URLS from database instead of lists
    stopping_events = {url: threading.Event() for url in URLS}
    for url in URLS:
        threading.Thread(target=doRequest, args=(url, stopping_events[url])).start()
Whenever you want to stop/remove a particular url, you can just call
stopping_events[url].set()
That particular while loop will stop and exit.
You can even create a separate thread that waits for user input to stop a particular URL:
def manager(stopping_events):
    while True:
        url = input('url to stop: ')
        if url in stopping_events:
            stopping_events[url].set()

if __name__ == '__main__':
    # TODO read URLS from database instead of lists
    stopping_events = {url: threading.Event() for url in URLS}
    for url in URLS:
        threading.Thread(target=doRequest, args=(url, stopping_events[url])).start()
    threading.Thread(target=manager, args=(stopping_events,)).start()
Method 2: A cleaner approach
Instead of having a fixed list of URLs, you can have a thread that keeps reading the list of URLs from the database and feeding it to the processing threads. This is the producer-consumer pattern. Now you don't really remove any URL; you simply keep processing the latest list of URLs from the database, which automatically takes care of newly added/deleted URLs.
import queue
import threading
import requests
from bs4 import BeautifulSoup

def get_urls_from_db(q: queue.Queue):
    while True:
        url_list = ...  # some db read logic
        for url in url_list:  # putting newly read URLs into the queue
            q.put(url)

def doRequest(q: queue.Queue):
    while True:
        url = q.get()  # waiting and getting url from queue
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            if soup.find("span", {"data-search-type": "Repositories"}).text.strip():  # if there are sizes
                sendNotifications({
                    'title': soup.find("input", {"name": "q"})['value'],
                    'repo_count': soup.find("span", {"data-search-type": "Repositories"}).text.strip()
                })
        else:
            print(url, response.status_code)

def sendNotifications(data):
    ...

if __name__ == '__main__':
    url_queue = queue.Queue()
    for _ in range(10):  # start 10 worker threads
        threading.Thread(target=doRequest, args=(url_queue,)).start()
    threading.Thread(target=get_urls_from_db, args=(url_queue,)).start()
get_urls_from_db keeps reading URLs from the database and adds the current list of URLs to url_queue to be processed.
In doRequest, each iteration of the loop now grabs one URL from url_queue and processes it.
One thing to watch out for is adding URLs faster than the workers can process them; the queue length will then grow over time and consume a lot of memory.
This is arguably better since you now have direct control over which URLs are processed and a fixed number of worker threads.
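One way to mitigate that unbounded growth, sketched below and not part of the original answer: construct queue.Queue with a maxsize so the producer blocks whenever the workers fall behind (read_urls_from_db is a hypothetical stand-in for the real database read).

import queue

# A bounded queue: put() blocks once maxsize items are waiting, so the
# producer can never run far ahead of the workers (natural backpressure).
url_queue = queue.Queue(maxsize=100)

def read_urls_from_db():
    # hypothetical stand-in for the real database read
    return ['https://github.com/search?q=python+3']

def get_urls_from_db(q: queue.Queue):
    while True:
        for url in read_urls_from_db():
            q.put(url)  # blocks here whenever the queue is full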

threading: function seems to run as a blocking loop although i am using threading

I am trying to speed up web scraping by running my http requests in a ThreadPoolExecutor from the concurrent.futures library.
Here is the code:
import concurrent.futures
import requests
from bs4 import BeautifulSoup

urls = [
    'https://www.interactivebrokers.eu/en/index.php?f=41295&exch=ibfxcfd&showcategories=CFD',
    'https://www.interactivebrokers.eu/en/index.php?f=41634&exch=chix_ca',
    'https://www.interactivebrokers.eu/en/index.php?f=41634&exch=tase',
    'https://www.interactivebrokers.eu/en/index.php?f=41295&exch=chixen-be&showcategories=STK',
    'https://www.interactivebrokers.eu/en/index.php?f=41295&exch=bvme&showcategories=STK'
]

def get_url(url):
    print(url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    a = soup.select_one('a')
    print(a)

with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor:
    results = {executor.submit( get_url(url)) : url for url in urls}
    for future in concurrent.futures.as_completed(results):
        try:
            pass
        except Exception as exc:
            print('ERROR for symbol:', results[future])
            print(exc)
However, when looking at how the script prints in the CLI, it seems that the requests are sent in a blocking loop.
Additionally, if I run the code with a plain loop as below, I can see that it takes roughly the same time:

for u in urls:
    get_url(u)

I have had some success implementing concurrency with that library before, and I am at a loss as to what is going wrong here.
I am aware of the existence of the asyncio library as an alternative, but I would be keen on using threading instead.
You're not actually running your get_url calls as tasks; you call them in the main thread, and pass the result to executor.submit, experiencing the concurrent.futures analog to this problem with raw threading.Thread usage. Change:
results = {executor.submit( get_url(url)) : url for url in urls}
to:
results = {executor.submit(get_url, url) : url for url in urls}
so you pass the function to call and its arguments to the submit call (which then runs them in threads for you) and it should parallelize your code.
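For reference, a sketch of the whole corrected block, assuming the same get_url and urls as in the question; calling future.result() in the as_completed loop also surfaces any exception raised inside a worker:

with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor:
    # submit(fn, *args): the executor calls get_url(url) in a worker thread
    results = {executor.submit(get_url, url): url for url in urls}
    for future in concurrent.futures.as_completed(results):
        try:
            future.result()  # re-raises any exception from the worker
        except Exception as exc:
            print('ERROR for symbol:', results[future])
            print(exc)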

Fastest way to read and process 100,000 URLs in Python

I have a file with 100,000 URLs that I need to request and then process. The processing takes a non-negligible amount of time compared to the request, so simply using multithreading seems to give me only a partial speed-up. From what I have read, I think using the multiprocessing module, or something similar, would offer a more substantial speed-up because I could use multiple cores. I'm guessing I want to use multiple processes, each with multiple threads, but I'm not sure how to do that.
Here is my current code, using threading (based on What is the fastest way to send 100,000 HTTP requests in Python?):
from threading import Thread
from Queue import Queue
import requests
from bs4 import BeautifulSoup
import sys

concurrent = 100

def worker():
    while True:
        url = q.get()
        html = get_html(url)
        process_html(html)
        q.task_done()

def get_html(url):
    try:
        html = requests.get(url, timeout=5, headers={'Connection':'close'}).text
        return html
    except:
        print "error", url
        return None

def process_html(html):
    if html == None:
        return
    soup = BeautifulSoup(html)
    text = soup.get_text()
    # do some more processing
    # write the text to a file

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=worker)
    t.daemon = True
    t.start()

try:
    for url in open('text.txt'):
        q.put(url.strip())
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
If the file isn't bigger than your available memory, instead of opening it with the "open" method use mmap ( https://docs.python.org/3/library/mmap.html ). It will give roughly the same speed as if you were working with memory rather than a file.

import mmap

with open("test.txt", "rb") as f:
    mmap_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    # code that does what you need
    mmap_file.close()
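The question's idea of combining both models can also work: keep threads for the network I/O and hand each downloaded page to a process pool so the parsing can use multiple cores. Below is a rough Python 3 sketch of that split, not taken from any of the answers here; it reuses the get_html/process_html names from the question and keeps everything else deliberately simple:

# Python 3 sketch: thread pool for downloads, process pool for CPU-bound parsing
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import requests
from bs4 import BeautifulSoup

def get_html(url):
    try:
        return requests.get(url, timeout=5).text
    except requests.RequestException:
        return None

def process_html(html):
    if html is None:
        return None
    return BeautifulSoup(html, 'html.parser').get_text()

def main():
    with open('text.txt') as f:
        urls = [line.strip() for line in f]
    with ProcessPoolExecutor() as processes, ThreadPoolExecutor(100) as threads:
        # threads download concurrently; each finished page is shipped
        # to a worker process for parsing (holding all futures in a list
        # is fine for a sketch, but costs memory for 100,000 URLs)
        futures = [
            processes.submit(process_html, html)
            for html in threads.map(get_html, urls)
        ]
        for future in futures:
            text = future.result()
            # do some more processing / write the text to a file

if __name__ == '__main__':
    main()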

Webscrape multithread python 3

I have been doing a simple web scraping program to learn how to code and I made it work, but I wanted to see how to make it faster. I wanted to ask how I could implement multi-threading in this program. All the program does is open the stock symbols file and search for the price of each stock online.
Here is my code
import urllib.request
import urllib
from threading import Thread

symbolsfile = open("Stocklist.txt")
symbolslist = symbolsfile.read()
thesymbolslist = symbolslist.split("\n")

i = 0
while i < len(thesymbolslist):
    theurl = "http://www.google.com/finance/getprices?q=" + thesymbolslist[i] + "&i=10&p=25m&f=c"
    thepage = urllib.request.urlopen(theurl)
    # read the correct character encoding from `Content-Type` request header
    charset_encoding = thepage.info().get_content_charset()
    # apply encoding
    thepage = thepage.read().decode(charset_encoding)
    print(thesymbolslist[i] + " price is " + thepage.split()[len(thepage.split())-1])
    i = i + 1
If you are just applying a function to each item of a list, I recommend multiprocessing.Pool.map(function, list).
https://docs.python.org/3/library/multiprocessing.html?highlight=multiprocessing%20map#multiprocessing.pool.Pool.map
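A sketch of how that might look for this program; fetch_price is a hypothetical helper wrapping the loop body from the question, and the rest follows the Pool.map pattern the answer recommends:

import urllib.request
from multiprocessing import Pool

def fetch_price(symbol):
    # hypothetical helper built from the loop body in the question
    theurl = "http://www.google.com/finance/getprices?q=" + symbol + "&i=10&p=25m&f=c"
    thepage = urllib.request.urlopen(theurl)
    charset_encoding = thepage.info().get_content_charset()
    text = thepage.read().decode(charset_encoding)
    return symbol + " price is " + text.split()[-1]

if __name__ == '__main__':
    with open("Stocklist.txt") as f:
        thesymbolslist = f.read().split("\n")
    with Pool() as pool:  # defaults to os.cpu_count() worker processes
        for line in pool.map(fetch_price, thesymbolslist):
            print(line)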
You can use asyncio. It's quite a neat package that can also help you with scraping. I originally created this small snippet for integrating with LinkedIn, but you can adapt it to your needs quite easily.
import asyncio
import requests

def scrape_first_site():
    url = 'http://example.com/'
    response = requests.get(url)

def scrape_another_site():
    url = 'http://example.com/other/'
    response = requests.get(url)

loop = asyncio.get_event_loop()
tasks = [
    loop.run_in_executor(None, scrape_first_site),
    loop.run_in_executor(None, scrape_another_site)
]
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
Since the default executor is a ThreadPoolExecutor, each task runs in a separate thread. If you'd rather run tasks in processes than threads (for example because of GIL-related issues), you can use a ProcessPoolExecutor instead.
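For example, passing an explicit executor instead of None would look roughly like this (a sketch that builds on the loop and scrape_* functions defined in the snippet above):

from concurrent.futures import ProcessPoolExecutor

process_pool = ProcessPoolExecutor()  # one worker per CPU core by default
tasks = [
    loop.run_in_executor(process_pool, scrape_first_site),
    loop.run_in_executor(process_pool, scrape_another_site)
]
loop.run_until_complete(asyncio.wait(tasks))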

Processing Result outside For Loop in Python

I have this simple code which fetches pages via urllib2:

browser_list = ['Chrome','Mozilla','Safari','Internet Explorer','Opera']
user_string_url = "http://www.useragentstring.com/pages/"
for eachBrowser in browser_list:
    result = urllib2.urlopen(urljoin(user_string_url, eachBrowser))

Now I can read the result via result.read(), but I was wondering if all this processing could be done outside the for loop; otherwise the other URLs to be fetched have to wait until the current result has been processed.
I want to process result outside the for loop. Can this be done?
One of the ways to do this may be to have result as a dictionary. What you can do is:

result = {}
for eachBrowser in browser_list:
    result[eachBrowser] = urllib2.urlopen(urljoin(user_string_url, eachBrowser))
and use result[BrowserName] outside the loop.
Hope this helps.
If you simply want to access all results outside the loop, just collect them in a list or dictionary as in the answer above.
Or, if you are trying to speed up your task, try multithreading.
import threading

class myThread(threading.Thread):
    def __init__(self, result):
        threading.Thread.__init__(self)
        self.result = result
    def run(self):
        # process your result (as self.result) here
        pass

browser_list = ['Chrome','Mozilla','Safari','Internet Explorer','Opera']
user_string_url = "http://www.useragentstring.com/pages/"
for eachBrowser in browser_list:
    result = urllib2.urlopen(urljoin(user_string_url, eachBrowser))
    myThread(result).start()  # starts processing result on another thread and continues the loop without waiting

It's a simple way of multithreading. It may break depending on your result processing; consider reading the documentation and some examples before you try it.
You can use threads for this:
import threading
import urllib2
from urlparse import urljoin

def worker(url):
    res = urllib2.urlopen(url)
    data = res.read()
    res.close()

browser_list = ['Chrome', 'Mozilla', 'Safari', 'Internet Explorer', 'Opera']
user_string_url = 'http://www.useragentstring.com/'
for browser in browser_list:
    url = urljoin(user_string_url, browser)
    threading.Thread(target=worker, args=[url]).start()

# wait for everyone to complete
for thread in threading.enumerate():
    if thread == threading.current_thread(): continue
    thread.join()
Are you using Python 3? If so, you can use futures for this task:
from urllib.request import urlopen
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor

browser_list = ['Chrome','Mozilla','Safari','Internet+Explorer','Opera']
user_string_url = "http://www.useragentstring.com/pages/"

def process_request(url, future):
    print("Processing:", url)
    print("Reading data")
    print(future.result().read())

with ThreadPoolExecutor(max_workers=10) as executor:
    submit = executor.submit
    for browser in browser_list:
        url = urljoin(user_string_url, browser) + '/'
        submit(process_request, url, submit(urlopen, url))
You could also do this with yield:

def collect_browsers():
    browser_list = ['Chrome','Mozilla','Safari','Internet Explorer','Opera']
    user_string_url = "http://www.useragentstring.com/pages/"
    for eachBrowser in browser_list:
        yield eachBrowser, urllib2.urlopen(urljoin(user_string_url, eachBrowser))

def process_browsers():
    for browser, result in collect_browsers():
        do_something(result)

This is still a synchronous call (browser 2 will not fire until browser 1 is processed), but you can keep the logic for dealing with the results separate from the logic managing the connections. You could of course also use threads to handle the processing asynchronously, with or without yield.
Edit
Just re-read the OP, and I should repeat that yield does not provide multi-threaded, asynchronous execution, in case that was not clear in my first answer!
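For completeness, a sketch of what "use threads for the processing" could look like, assuming the collect_browsers generator above and a hypothetical do_something function (this is not part of the original answer): the fetches still happen one at a time inside the generator, but each result is handed off to a worker thread for processing.

from concurrent.futures import ThreadPoolExecutor

def process_browsers_threaded():
    with ThreadPoolExecutor(max_workers=5) as executor:
        # each fetched result is processed in a worker thread while the
        # generator moves on to fetching the next browser page
        for browser, result in collect_browsers():
            executor.submit(do_something, result)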
