Using WorkerPool to multithread through a list of URLs - python

I'm trying to use multithreads to go through a txt file of urls and scrape the contents found at each url. This works for about 20 URLs (not consistent how many) but then consistently gets stuck on the last url in the file. It doesn't seem to be doing them in order.
I have no idea why it's getting stuck or where to start so thank you so much for your help.
from bs4 import BeautifulSoup, SoupStrainer
import urllib3
import urllib2
import io
import os
import re
import workerpool
from urllib2 import Request, urlopen, URLError
NUM_SOCKETS = 3
NUM_WORKERS = 5
urlfile = open("dailynewsurls.txt",'r') # read one line at a time until end of file
http = urllib3.PoolManager(maxsize=NUM_SOCKETS)
workers = workerpool.WorkerPool(size=NUM_WORKERS)
class MyJob(workerpool.Job):
def __init__(self, url):
self.url = url
def run(self):
r = http.request('GET', self.url)
req = urllib2.Request(url)
try:
page = urllib2.urlopen(req)
except:
print "had to skip one"
return
pagecontent = page.read() # get a file-like object at this url
#this tells it to soup the page that is at the url above
soup = BeautifulSoup(pagecontent)
#this tells it to find the string in the first instance of each of the tags in the parenthesis
title = soup.find_all('title')
article = soup.find_all('article')
try:
title = str(title[0].get_text().encode('utf-8'))
except:
print "had to skip one"
return
try:
article = str(article[0].get_text().encode('utf-8'))
except:
print "had to skip one"
return
try:
# make the file using the things above
output_files_pathname = 'DailyNews/' # path where output will go
new_filename = title + ".txt"
# write each of the things defined into the text file
outfile = open(output_files_pathname + new_filename,'w')
outfile.write(title)
outfile.write("\n")
outfile.write(article)
outfile.close()
print "%r added as a text file" % title
return
except:
print "had to skip one"
return
return
for url in urlfile:
workers.put(MyJob(url))
workers.shutdown()
workers.wait()
print "All done."
Here's an example list of the urls:
http://www.nydailynews.com/entertainment/tv-movies/x-factor-season-2-episode-2-recap-oops-britney-spears-article-1.1159546
http://www.nydailynews.com/new-york/brooklyn/lois-mclohon-resurfaced-iconic-daily-news-coney-island-cheesecake-photo-brings-back-memories-50-year-long-romance-article-1.1160457
http://www.nydailynews.com/new-york/uptown/espaillat-linares-rivals-bitter-history-battle-state-senate-seat-article-1.1157994
http://www.nydailynews.com/sports/baseball/mlb-power-rankings-yankees-split-orioles-tumble-rankings-nationals-shut-stephen-strasburg-hang-top-spot-article-1.1155953
http://www.nydailynews.com/news/national/salon-sell-internet-online-communities-article-1.1150614
http://www.nydailynews.com/sports/more-sports/jiyai-shin-wins-women-british-open-dominating-fashion-record-nine-shot-victory-article-1.1160894
http://www.nydailynews.com/entertainment/music-arts/justin-bieber-offered-hockey-contract-bakersfield-condors-minor-league-team-article-1.1157991
http://www.nydailynews.com/sports/baseball/yankees/umpire-blown-call-9th-inning-dooms-yankees-5-4-loss-baltimore-orioles-camden-yards-article-1.1155141
http://www.nydailynews.com/entertainment/gossip/kellie-pickler-shaving-head-support-best-friend-cancer-fight-hair-article-1.1160938
http://www.nydailynews.com/new-york/secret-103-000-settlement-staffers-accused-assemblyman-vito-lopez-sexual-harassment-included-penalty-20k-involved-talked-details-article-1.1157849
http://www.nydailynews.com/entertainment/tv-movies/ricki-lake-fun-adds-substance-new-syndicated-daytime-show-article-1.1153301
http://www.nydailynews.com/sports/college/matt-barkley-loyalty-usc-trojans-contention-bcs-national-championship-article-1.1152969
http://www.nydailynews.com/sports/daily-news-sports-photos-day-farewell-andy-roddick-world-1-u-s-open-champ-retires-loss-juan-martin-del-potro-article-1.1152827
http://www.nydailynews.com/entertainment/gossip/britney-spears-made-move-relationship-fiance-jason-trawick-reveals-article-1.1152722
http://www.nydailynews.com/new-york/brooklyn/brooklyn-lupus-center-tayumika-zurita-leads-local-battle-disease-difficult-adversary-article-1.1153494
http://www.nydailynews.com/life-style/fashion/kate-middleton-prabal-gurung-dress-sells-hour-myhabit-site-sold-1-995-dress-599-article-1.1161583
http://www.nydailynews.com/news/politics/obama-romney-campaigns-vie-advantage-president-maintains-lead-article-1.1161540
http://www.nydailynews.com/life-style/free-cheap-new-york-city-tuesday-sept-11-article-1.1155950
http://www.nydailynews.com/news/world/dozens-storm-embassy-compound-tunis-article-1.1159663
http://www.nydailynews.com/opinion/send-egypt-message-article-1.1157828
http://www.nydailynews.com/sports/more-sports/witnesses-feel-sheryl-crow-lance-amstrong-activities-article-1.1152899
http://www.nydailynews.com/sports/baseball/yankees/hiroki-kuroda-replacing-cc-sabathia-yankees-ace-pitcher-real-possibility-playoffs-looming-article-1.1161812
http://www.nydailynews.com/life-style/eats/finland-hosts-pop-down-restaurant-belly-earth-262-feet-underground-article-1.1151523
http://www.nydailynews.com/sports/more-sports/mighty-quinn-sept-23-article-1.1165584
http://www.nydailynews.com/sports/more-sports/jerry-king-lawler-stable-condition-suffering-heart-attack-wwe-raw-broadcast-monday-night-article-1.1156915
http://www.nydailynews.com/news/politics/ambassador-chris-stevens-breathing-libyans-found-american-consulate-rescue-article-1.1161454
http://www.nydailynews.com/news/crime/swiss-banker-bradley-birkenfeld-104-million-reward-irs-blowing-whistle-thousands-tax-dodgers-article-1.1156736
http://www.nydailynews.com/sports/hockey/nhl-board-governors-votes-favor-lockout-league-players-association-fail-reach-agreement-cba-article-1.1159131
http://www.nydailynews.com/news/national/iphone-5-works-t-network-article-1.1165543
http://www.nydailynews.com/sports/baseball/yankees/yankees-broadcasters-michael-kay-ken-singleton-opportunity-important-statement-article-1.1165479
http://www.nydailynews.com/news/national/boss-year-michigan-car-dealer-retires-employees-1-000-year-service-article-1.1156763
http://www.nydailynews.com/entertainment/tv-movies/hero-denzel-washington-clint-eastwood-article-1.1165538
http://www.nydailynews.com/sports/football/giants/ny-giants-secondary-roasted-tony-romo-dallas-cowboys-offense-article-1.1153055
http://www.nydailynews.com/news/national/hide-and-seek-tragedy-3-year-old-suffocates-hiding-bean-bag-article-1.1160138

I would try using the threading module; here is something I think is working:
from bs4 import BeautifulSoup, SoupStrainer
import threading
import urllib2
def fetch_url(url):
urlHandler = urllib2.urlopen(url)
html = urlHandler.read()
#this tells it to soup the page that is at the url above
soup = BeautifulSoup(html)
#this tells it to find the string in the first instance of each of the tags in the parenthesis
title = soup.find_all('title')
article = soup.find_all('article')
try:
title = str(title[0].get_text().encode('utf-8'))
except:
print "had to skip one bad title\n"
return
try:
article = str(article[0].get_text().encode('utf-8'))
except:
print "had to skip one bad article"
return
try:
# make the file using the things above
output_files_pathname = 'DailyNews/' # path where output will go
new_filename = title + ".txt"
# write each of the things defined into the text file
outfile = open(output_files_pathname + new_filename, 'w')
outfile.write(title)
outfile.write("\n")
outfile.write(article)
outfile.close()
print "%r added as a text file" % title
return
except:
print "had to skip one cant write file"
return
return
with open("dailynewsurls.txt", 'r') as urlfile:
# read one line at a time until end of file
threads = [threading.Thread(target=fetch_url, args=(url,)) for url in urlfile]
for thread in threads:
thread.start()
for thread in threads:
thread.join()

Related

how to speed up my process

I wrote a script that will web scrape data for a list of stocks. The scraper has to get the data from 2 separate pages so each stock symbol must scrape 2 different pages. If I run the process on a list that is 1000 items long it will take around 30 minutes to complete. It's not horrible, I can set it and forget it, but I'm wondering if there is a way to speed up the process. Maybe store the data and wait to write it all at the end instead of on each loop? Any other ideas appreciated.
import requests
from BeautifulSoup import BeautifulSoup
from progressbar import ProgressBar
import csv
symbols = {'AMBTQ','AABA','AAOI','AAPL','AAWC','ABEC','ABQQ','ACFN','ACIA','ACIW','ACLS'}
pbar = ProgressBar()
with open('industrials.csv', "ab") as csv_file:
writer = csv.writer(csv_file, delimiter=',')
writer.writerow(['Symbol','5 Yr EPS','EPS TTM'])
for s in pbar(symbols):
try:
url1 = 'https://research.tdameritrade.com/grid/public/research/stocks/fundamentals?symbol='
full1 = url1 + s
response1 = requests.get(full1)
html1 = response1.content
soup1 = BeautifulSoup(html1)
for hist_div in soup1.find("div", {"data-module-name": "HistoricGrowthAndShareDetailModule"}):
EPS5yr = hist_div.find('label').text
except Exception as e:
EPS5yr = 'Bad Data'
pass
try:
url2 = 'https://research.tdameritrade.com/grid/public/research/stocks/summary?symbol='
full2 = url2 + s
response2 = requests.get(full2)
html2 = response2.content
soup2 = BeautifulSoup(html2)
for div in soup2.find("div", {"data-module-name": "StockSummaryModule"}):
EPSttm = div.findAll("dd")[11].text
except Exception as e:
EPSttm = "Bad data"
pass
writer.writerow([s,EPS5yr,EPSttm])

Having difficulties with my python script

I am new to Python and can use a little help. I am trying to write a script that will go out to a specific web site and download multiple .gif images in different spots at that site. Can anyone assist me in the right direction to take. This is the first one I have tried to make.
Here is what i got so far.
from http:// import http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/ as bs
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
import os
import sys
def main(url, out_folder="C:\Users\jerry\Desktop\Heli/"):
"""Downloads all the images at 'url' to /test/"""
http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/ = bs(urlopen(url))
parsed = list(urlparse.urlparse(url))
for image in http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/.findAll("gif"):
print "gif: %(src)s" % image
filename = gif["src"].split("/")[-1]
parsed[2] = gif["src"]
outpath = os.path.join(out_folder, filename)
if gif["src"].lower().startswith("http"):
urlretrieve(gif["src"], outpath)
else:
urlretrieve(urlparse.urlunparse(parsed), outpath)
def _usage():
print "usage: python dumpimages.py http:folkworm.ceri.memphis.edu/heli/heli_bb_ag/ [outpath]"
if __name__ == "__main__":
url = sys.argv[-1]
out_folder = "/test/"
if not url.lower().startswith("http"):
out_folder = sys.argv[-1]
url = sys.argv[-2]
if not url.lower().startswith("http"):
_usage()
sys.exit(-1)
main(url, out_folder)
Here is the basic idea.
>>> import requests
>>> from bs4 import BeautifulSoup
>>> item = requests.get('http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/')
>>> page = item.text
>>> soup = BeautifulSoup(page, 'lxml')
>>> links = soup.findAll('a')
>>> for link in links:
... if '.gif' in link.attrs['href']:
... print (link.attrs['href'])
... break
...
CCAR_HHZ_AG_00.2017012700.gif?v=1485534942
The break statement is there just to interrupt the script so that it doesn't print all of the names of the gif's. The next step would be to add code to that loop to concatenate the URL mentioned in the requests.get to the name of each gif and do a requests.get for it. This time though you would do, say, image = item.content to get the image in bytes, which you could write to a file of your choice.
EDIT: Fleshed out. Note you still need to arrange to provide one file name for each output file.
>>> import requests
>>> from bs4 import BeautifulSoup
>>> URL = 'http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/'
>>> item = requests.get(URL)
>>> page = item.text
>>> soup = BeautifulSoup(page, 'lxml')
>>> links = soup.findAll('a')
>>> for link in links:
... if '.gif' in link.attrs['href']:
... print (link.attrs['href'])
... pic = requests.get(URL + link.attrs['href'])
... image = pic.content
... open('pic.gif', 'wb').write(image)
... break
...
CCAR_HHZ_AG_00.2017012700.gif?v=1485535857
100846

BeautifulSoup findAll HTML class with multiple variable class inputs

I have the following code which scrapes a website for divs with the class "odd" or "even". I'd like to make "odd" and "even" an argument my function takes in, which would allow me to add other divs as well. Here is my code:
#
# Imports
#
import urllib2
from bs4 import BeautifulSoup
import re
import os
from pprint import pprint
#
# library
#
def get_soup(url):
page = urllib2.urlopen(url)
contents = page.read()
soup = BeautifulSoup(contents, "html.parser")
body = soup.findAll("tr", ["even", "odd"])
string_list = str([i for i in body])
return string_list
def save_to_file(path, soup):
with open(path, 'w') as fhandle:
fhandle.write(soup)
#
# script
#
def main():
url = r'URL GOES HERE'
path = os.path.join('PATH GOES HERE')
the_soup = get_soup(url)
save_to_file(path, the_soup)
if __name__ == '__main__':
main()
I'd like to incorporate *args into the code so the get_soup function would look like this:
def get_soup(url, *args):
page = urllib2.urlopen(url)
contents = page.read()
soup = BeautifulSoup(contents, "html.parser")
body = soup.findAll("tr", [args])
string_list = str([i for i in body])
return string_list
def main():
url = r'URL GOES HERE'
path = os.path.join('PATH GOES HERE')
the_soup = get_soup(url, "odd", "even")
save_to_file(path, the_soup)
Unfortunately, this isn't working. Ideas?
Don't put args in a list, args is already a tuple so just pass that:
body = soup.findAll("tr", args)
If you [args], you would end up with something like [("odd","even")].
Also str([i for i in body]) makes no real sense, it would be the same as just doing str(body) but I don't see how that format could be useful.

Fetching the first image from a website that belongs to the post

I've written a program that fetches the desired information from a blog or any page. The next thing, I want to achieve is to retrieve the first image from that page, that belongs to the respective post (Just like Facebook does when a post is shared).
I was able to achieve this to some extent by fetching the first image with an alt tag (since many websites don't have alt tags in their logos and icons etc, the first one should belong to the post). But this does not seem to work in some cases. Is there any other (better) way to achieve this?
I'm using python 2.7.9 and BeautifulSoup 4.
d = feedparser.parse('http://rss.cnn.com/rss/edition.rss')
for entry in d.entries:
try:
if entry.title is not None:
print entry.title
print ""
except Exception, e:
print e
try:
if entry.link is not None:
print entry.link
print ""
except Exception, e:
print e
try:
if entry.published[5:16] is not None:
print entry.published[5:16]
print ""
except Exception, e:
print e
try:
if entry.category is not None:
print entry.category
print ""
except Exception, e:
print e
try:
if entry.get('summary', '') is not None:
print entry.get('summary', '')
print ""
except Exception, e:
print e
time.sleep(5)
r = requests.get(entry.link, headers = {'User-Agent' : 'Safari/534.55.3 '})
soup = BeautifulSoup(r.text, 'html.parser')
for img in soup.findAll('img'):
if img.has_attr('alt'):
if img['src'].endswith('.jpg') == True or img['src'].endswith('.png') == True:
print img['src']
break
It is probably more practical to take a look at the opengraph module:
https://pypi.python.org/pypi/opengraph/0.5
and correct it the way you like.
It will fetch "first image" from HTML code or use og:image.
If you want to learn, you can also do it by looking at the source code. The module uses BeautifulSoup too.
I needed the following monkeypatch to activate scraping as fallback:
import re
from bs4 import BeautifulSoup
from opengraph import OpenGraph
def parser(self, html):
"""
"""
if not isinstance(html,BeautifulSoup):
doc = BeautifulSoup(html, from_encoding='utf-8')
else:
doc = html
ogs = doc.html.head.findAll(property=re.compile(r'^og'))
for og in ogs:
self[og[u'property'][3:]]=og[u'content']
# Couldn't fetch all attrs from og tags, try scraping body
if not self.is_valid() and self.scrape:
for attr in self.required_attrs:
if not hasattr(self, attr):
try:
self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
except AttributeError:
pass
OpenGraph.parser = parser
OpenGraph.scrape = True # workaround for some subtle bug in opengraph
You may need to handle relatives URLs in the image sources, but it is quite straightforward with use of urljoin from urlparse
import opengraph
...
page = opengraph.OpenGraph(url=link, scrape=True)
...
if page.is_valid():
...
image_url = page.get('image', None)
...
if not image_url.startswith('http'):
image_url = urljoin(page['_url'], page['image'])
(some check are omitted for brevity from the code fragment)

Gevent link crawler

Here i have written the code using python and beautiful soup to parse all the links on that page into a repository of links. Next, it fetches the contents of any of the url from the repository just created, parses the links from this new content into the repository and continues this process for all links in the repository until stopped or after a given number of links are fetched.
But this code is very slow. How can i improve it by using asynchronous programming using gevents in python ?
Code
class Crawler(object):
def __init__(self):
self.soup = None # Beautiful Soup object
self.current_page = "http://www.python.org/" # Current page's address
self.links = set() # Queue with every links fetched
self.visited_links = set()
self.counter = 0 # Simple counter for debug purpose
def open(self):
# Open url
print self.counter , ":", self.current_page
res = urllib2.urlopen(self.current_page)
html_code = res.read()
self.visited_links.add(self.current_page)
# Fetch every links
self.soup = BeautifulSoup.BeautifulSoup(html_code)
page_links = []
try :
page_links = itertools.ifilter( # Only deal with absolute links
lambda href: 'http://' in href,
( a.get('href') for a in self.soup.findAll('a') ) )
except Exception as e: # Magnificent exception handling
print 'Error: ',e
pass
# Update links
self.links = self.links.union( set(page_links) )
# Choose a random url from non-visited set
self.current_page = random.sample( self.links.difference(self.visited_links),1)[0]
self.counter+=1
def run(self):
# Crawl 3 webpages (or stop if all url has been fetched)
while len(self.visited_links) < 3 or (self.visited_links == self.links):
self.open()
for link in self.links:
print link
if __name__ == '__main__':
C = Crawler()
C.run()
Update 1
import gevent.monkey; gevent.monkey.patch_thread()
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse
import sys
import gevent.monkey; gevent.monkey.patch_all(thread=False)
class Crawler(object):
def __init__(self):
self.soup = None # Beautiful Soup object
self.current_page = "http://www.python.org/" # Current page's address
self.links = set() # Queue with every links fetched
self.visited_links = set()
self.counter = 0 # Simple counter for debug purpose
def open(self):
# Open url
print self.counter , ":", self.current_page
res = urllib2.urlopen(self.current_page)
html_code = res.read()
self.visited_links.add(self.current_page)
# Fetch every links
self.soup = BeautifulSoup(html_code)
page_links = []
try :
for link in [h.get('href') for h in self.soup.find_all('a')]:
print "Found link: '" + link + "'"
if link.startswith('http'):
print 'entered in if link: ',link
page_links.append(link)
print "Adding link" + link + "\n"
elif link.startswith('/'):
print 'entered in elif link: ',link
parts = urlparse.urlparse(self.current_page)
page_links.append(parts.scheme + '://' + parts.netloc + link)
print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
else:
print 'entered in else link: ',link
page_links.append(self.current_page+link)
print "Adding link " + self.current_page+link + "\n"
except Exception, ex: # Magnificent exception handling
print ex
# Update links
self.links = self.links.union( set(page_links) )
# Choose a random url from non-visited set
self.current_page = random.sample( self.links.difference(self.visited_links),1)[0]
self.counter+=1
def run(self):
# Crawl 3 webpages (or stop if all url has been fetched)
crawling_greenlets = []
for i in range(3):
crawling_greenlets.append(gevent.spawn(self.open))
gevent.joinall(crawling_greenlets)
#while len(self.visited_links) < 4 or (self.visited_links == self.links):
# self.open()
for link in self.links:
print link
if __name__ == '__main__':
C = Crawler()
C.run()
import gevent and make sure monkey-patching is done to make standard library calls non-blocking and aware of gevent:
import gevent
from gevent import monkey; monkey.patch_all()
(you can selectively decide what has to be monkey-patched, but let's say it is not
your problem at the moment)
In your run, make your open function to be called inside a greenlet. run can
return the greenlet object, so you can wait for it whenever you need to get the
results using gevent.joinall for example. Something like this:
def run(self):
return gevent.spawn(self.open)
c1 = Crawler()
c2 = Crawler()
c3 = Crawler()
crawling_tasks = [c.run() for c in (c1,c2,c3)]
gevent.joinall(crawling_tasks)
print [c.links for c in (c1, c2, c3)]

Categories

Resources