I wrote a script that checks ("parses") every domain in a file. After launch everything runs as it should, but when only a few domains are left at the end it gets stuck: sometimes the last couple of domains take a very long time to process. I can't figure out what the problem is. Has anyone run into this? How can it be fixed?
Right after launch everything runs very quickly (as it should) until the very end, where it stalls with a few domains remaining. It makes no difference whether the list has 1,000 domains or 10,000.
Complete code:
import re
import sys
import json
import requests
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

pool = 100

with open("Rules.json") as file:
    REGEX = json.loads(file.read())

ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'}


def Domain_checker(domain):
    try:
        r = requests.get("http://" + domain, verify=False, headers=ua)
        r.encoding = "utf-8"

        for company in REGEX.keys():
            for type in REGEX[company]:
                check_entry = 0

                for ph_regex in REGEX[company][type]:
                    if bool(re.search(ph_regex, r.text)) is True:
                        check_entry += 1

                if check_entry == len(REGEX[company][type]):
                    title = BeautifulSoup(r.text, "lxml")
                    Found_domain = "\nCompany: {0}\nRule: {1}\nURL: {2}\nTitle: {3}\n".format(company, type, r.url, title.title.text)
                    print(Found_domain)
                    with open("/tmp/__FOUND_DOMAINS__.txt", "a", encoding='utf-8', errors='ignore') as file:
                        file.write(Found_domain)

    except requests.exceptions.ConnectionError:
        pass
    except requests.exceptions.TooManyRedirects:
        pass
    except requests.exceptions.InvalidSchema:
        pass
    except requests.exceptions.InvalidURL:
        pass
    except UnicodeError:
        pass
    except requests.exceptions.ChunkedEncodingError:
        pass
    except requests.exceptions.ContentDecodingError:
        pass
    except AttributeError:
        pass
    except ValueError:
        pass

    return domain


if __name__ == '__main__':
    with open(sys.argv[1], "r", encoding='utf-8', errors='ignore') as file:
        Domains = file.read().split()

    pool = 100
    print("Pool = ", pool)

    results = ThreadPool(pool).imap_unordered(Domain_checker, Domains)
    string_num = 0
    for result in results:
        print("{0} => {1}".format(string_num, result))
        string_num += 1

    with open("/tmp/__FOUND_DOMAINS__.txt", encoding='utf-8', errors='ignore') as found_domains:
        found_domains = found_domains.read()

    print("{0}\n{1}".format("#" * 40, found_domains))
requests.get("http://" + domain, headers=ua, verify=False, timeout=10)
The problem is resolved after installing timeout
Thank you to the user with the nickname "eri" (https://ru.stackoverflow.com/users/16574/eri) :)
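For context: requests applies no timeout by default, so a server that accepts the connection but never answers can block a worker thread indefinitely, which is exactly what keeps the last few domains hanging. A minimal sketch of the same call with a separate connect/read timeout (the 5- and 10-second values and the helper name fetch are illustrative assumptions, not part of the original fix):
def fetch(domain):
    # (connect timeout, read timeout) -- bounds both phases of the request,
    # so a silent server cannot block a worker thread forever
    return requests.get("http://" + domain, headers=ua, verify=False, timeout=(5, 10))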
Related
This is the code I am using to download images from a Google results page. It is slow at both evaluating and downloading the images, so I thought of using the BeautifulSoup library for faster evaluation and download. Here is the original code:
import time
import sys
import os
import urllib2

search_keyword = ['Australia']
keywords = [' high resolution']

def download_page(url):
    import urllib2
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib2.Request(url, headers = headers)
        response = urllib2.urlopen(req)
        page = response.read()
        return page
    except:
        return "Page Not found"

def _images_get_next_item(s):
    start_line = s.find('rg_di')
    if start_line == -1:
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        start_line = s.find('"class="rg_meta"')
        start_content = s.find('"ou"', start_line+1)
        end_content = s.find(',"ow"', start_content+1)
        content_raw = str(s[start_content+6:end_content-1])
        return content_raw, end_content

def _images_get_all_items(page):
    items = []
    while True:
        item, end_content = _images_get_next_item(page)
        if item == "no_links":
            break
        else:
            items.append(item)
            time.sleep(0.1)
            page = page[end_content:]
    return items

t0 = time.time()
i = 0
while i < len(search_keyword):
    items = []
    iteration = "Item no.: " + str(i+1) + " -->" + " Item name = " + str(search_keyword[i])
    print (iteration)
    print ("Evaluating...")
    search_keywords = search_keyword[i]
    search = search_keywords.replace(' ', '%20')

    try:
        os.makedirs(search_keywords)
    except OSError, e:
        if e.errno != 17:
            raise
        pass

    j = 0
    while j < len(keywords):
        pure_keyword = keywords[j].replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + pure_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
        raw_html = (download_page(url))
        time.sleep(0.1)
        items = items + (_images_get_all_items(raw_html))
        j = j + 1

    print ("Total Image Links = " + str(len(items)))
    print ("\n")

    info = open('output.txt', 'a')
    info.write(str(i) + ': ' + str(search_keyword[i-1]) + ": " + str(items) + "\n\n\n")
    info.close()

    t1 = time.time()
    total_time = t1 - t0
    print("Total time taken: " + str(total_time) + " Seconds")
    print ("Starting Download...")

    k = 0
    errorCount = 0
    while (k < len(items)):
        from urllib2 import Request, urlopen
        from urllib2 import URLError, HTTPError
        try:
            req = Request(items[k], headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
            response = urlopen(req, None, 15)
            output_file = open(search_keywords + "/" + str(k+1) + ".jpg", 'wb')
            data = response.read()
            output_file.write(data)
            response.close();
            print("completed ====> " + str(k+1))
            k = k+1;
        except IOError:
            errorCount += 1
            print("IOError on image " + str(k+1))
            k = k+1;
        except HTTPError as e:
            errorCount += 1
            print("HTTPError" + str(k))
            k = k+1;
        except URLError as e:
            errorCount += 1
            print("URLError " + str(k))
            k = k+1;
    i = i+1

print("\n")
print("Everything downloaded!")
print("\n" + str(errorCount) + " ----> total Errors")
I thought editing the function below would make the code work with the BeautifulSoup library and let my job finish faster:
def download_page(url):
    import urllib2
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib2.Request(url, headers = headers)
        #response = urllib2.urlopen(req)
        #page = response.read()
        return BeautifulSoup(urlopen(Request(req)), 'html.parser')
    except:
        return "Page Not found"
But the above code returns a blank result. Please let me know what I should change to make the code work properly with BeautifulSoup.
You can't just pass Google headers like that. The search engine is a lot more complex than simply substituting some keywords into a GET URL.
HTML is a markup language only useful for one-way rendering of human-readable information. For your application, you need machine-readable markup rather than trying to decipher human-readable text. Google already has a very comprehensive API, https://developers.google.com/custom-search/, which is easy to use and a much better way of achieving this than using BeautifulSoup.
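For illustration, a rough sketch of querying the Custom Search JSON API for image results with requests. The API key and search-engine ID are placeholders you have to create yourself, and the parameter names are taken from the Custom Search documentation; treat this as an outline under those assumptions, not a tested client:
import requests

API_KEY = "YOUR_API_KEY"          # placeholder
SEARCH_ENGINE_ID = "YOUR_CX_ID"   # placeholder ("cx" in the API)

def image_links(query, count=10):
    params = {
        "key": API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "q": query,
        "searchType": "image",    # restrict results to images
        "num": count,
    }
    r = requests.get("https://www.googleapis.com/customsearch/v1", params=params, timeout=15)
    r.raise_for_status()
    # each result item carries a direct "link" to the image
    return [item["link"] for item in r.json().get("items", [])]

print(image_links("Australia high resolution"))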
I use Python to download images from a website, but sometimes the image's content-length is zero, even though the image can be accessed normally in a web browser.
I have tried three methods and get the same result every time. How can I resolve this problem?
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 13:51:42 2017
"""
import urllib
import urllib2
import re
import uuid
import os
import requests
from lxml import etree
from multiprocessing import Pool

url = 'https://www.sina.com.cn/'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'

request = urllib2.Request(url)
request.add_header('User-Agent', user_agent)
response = urllib2.urlopen(request)
content = response.read()
tree = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
node = tree.xpath("//img/@src")

dic1 = {}
dic2 = {}
localPath = 'E:\\pictures\\'

def generateFileName():
    return str(uuid.uuid1())

def createFileWithFileName(localPathParam, fileName):
    totalPath = localPathParam + '\\' + fileName
    if not os.path.exists(totalPath):
        file = open(totalPath, 'wb')
        file.close()
    return totalPath

def worker(i):
    path = node[i]
    if not (dic1.has_key(path)):
        dic1[path] = 1
        index = path.rfind('/')
        suffix = path[index+1:]
        filename = suffix
        #filename = generateFileName()+'.'+suffix
        if(re.search(r'^(https?:)?\/\/', path)):
            #print('save picture %s as %s' % (path,filename))
            '''
            #this code get the same result too
            try:
                urllib.urlretrieve(path, createFileWithFileName(localPath, filename))
            except Exception, ex:
                print(ex.message)
            '''
            with open(localPath + filename, 'wb') as handle:
                response = requests.get(path, timeout=60)
                if not response.ok:
                    print response
                else:
                    print 'wrong when get ' + path
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
            '''
            #this code get the same result too
            try:
                req = urllib2.Request(path)
                req.add_header('User-Agent', user_agent)
                picture = urllib2.urlopen(url=path, timeout=5).read()
                document = open(localPath+filename,'wb')
                document.write(picture)
                document.close()
            except Exception, ex:
                print(ex.message)
            '''

if __name__=='__main__':
    p = Pool()
    for i in range(len(node)):
        p.apply_async(worker, args=(i,))
    print 'Waiting for all subprocesses done...'
    p.close()
    p.join()
    print 'All subprocesses done.'
I wrote this crawler in Python; it dumps several parameters to a JSON output file based on an input list of domains.
I have this question:
Do I need to close the HTTP connection in each thread? The input data is about 5 million items. At the beginning it processes about 50 iterations per second, but after some time it drops to 1-2 per second and/or hangs (no kernel messages and no errors on stdout). Is this caused by the code or is it network-related? I suspect the software, because when I restart it, it starts again at the high rate (about 50 iterations per second).
Any tips on how to improve the code below are also welcome, especially anything that improves speed and crawling throughput.
Code in question:
import urllib2
import re
import sys
import pprint
from tqdm import tqdm
import lxml.html
from Queue import Queue
from threading import Thread
from geoip import geolite2
import pycountry
from tld import get_tld

resfile = open("out.txt", 'a')

concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://" + ourl)
        peer = response.fp._sock.fp._sock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www" + ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\""+item+"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        #print "error"+str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status) + "\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt", 'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
Update 1:
Closing the socket and file descriptor makes it work better: it no longer seems to hang after some time. Performance is about 50 requests/sec on a home laptop and about 100 requests/sec on a VPS.
from threading import Thread
import httplib, sys
import urllib2
import re
import pprint
from tqdm import tqdm
import lxml.html
from Queue import Queue
from geoip import geolite2
import pycountry
from tld import get_tld
import json

resfile = open("out.txt", 'a')

concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://" + ourl)
        realsock = response.fp._sock.fp._sock
        peer = response.fp._sock.fp._sock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        realsock.close()
        response.close()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www" + ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\",\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":"+json.dumps(item)+",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\"," + "\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":\"\",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        print "error"+str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status) + "\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt", 'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
The handles will be garbage collected automatically, but you are better off closing them yourself, especially as you are doing this in a tight loop.
You also asked for suggestions for improvement. A big one would be to stop using urllib2 and start using requests instead.
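As a rough sketch of what that could look like (not a drop-in replacement: only a couple of the output fields are shown, and the 10-second timeout is an assumed value):
import json
import requests

def get_status(ourl):
    try:
        # requests manages the connection pool and closes sockets for you
        r = requests.get("http://" + ourl.strip(), timeout=10)
    except requests.exceptions.RequestException:
        return None
    record = {
        "Domain": "http://" + ourl.strip(),
        "Server": r.headers.get("Server", ""),
        "PoweredBy": r.headers.get("X-Powered-By", ""),
    }
    return json.dumps(record)  # json.dumps handles quoting and escaping correctly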
There are many possible reasons why your crawling rate drops.
1.) Take care not to crawl too much data from the same domain. Some web servers are configured to allow only one connection per IP address in parallel.
2.) Try to send randomized, browser-like HTTP headers (user-agent, referrer, ...) to get past web-server scraping protection, if it is set up (see the sketch after this list).
3.) Use a mature parallel HTTP library, such as pycurl (which has MultiCurl) or requests (grequests). They generally perform faster.
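A minimal sketch of point 2, reusing user-agent strings that already appear elsewhere on this page (any realistic pool of headers would do; the referrer value is an assumption):
import random
import requests

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17',
]

def fetch(url):
    headers = {
        'User-Agent': random.choice(USER_AGENTS),  # rotate the browser identity
        'Referer': 'https://www.google.com/',      # assumption: a generic referrer
    }
    return requests.get(url, headers=headers, timeout=10)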
I'm checking my customers' countries so that I know which services I can offer, and so on.
The problem is that the threads block: the script checks 15-20 emails and then stops. I want a way to keep it going.
The code is:
import requests
import re
from sys import argv
from Queue import Queue
from threading import Thread

e = argv[1]
emails = open(e, 'r').readlines()
emails = map(lambda s: s.strip(), emails)
valid = []

def base(email):
    xo = requests.get("http://www.paypal.com/xclick/business="+email, headers={"User-Agent":"Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0"}).text
    x = re.search("s.eVar36=\"(.*?)\";", xo)
    try:
        if x.group(1) != "":
            print "%s === %s" % (email, x.group(1))
            w = open(str(x.group(1))+".txt", 'a')
            w.write(email+"\n")
            valid.append(email)
    except:
        pass

def work():
    email = q.get()
    base(email)
    q.task_done()

THREADS = 25
q = Queue()
for i in range(THREADS):
    t = Thread(target=work())
    t.daemon = True
    t.start()

if (len(argv) > 0):
    for email in emails:
        q.put(email)
    q.join()
Thanks in advance.
Your problem is that you call work() instead of passing the work function when creating your threads. Rather than patching your code, consider moving to Python's ThreadPool, which does the heavy lifting for you. Here's an example that implements what you want.
pool.map calls your worker once for each email and returns a list of the workers' results. Your worker returns either a valid email or None for each email it is given, so you just have to filter out the Nones at the end.
import requests
import re
from sys import argv
import multiprocessing.pool

e = argv[1]
emails = [line.strip() for line in open(e)]

def base(email):
    print("getting email {}".format(email))
    try:
        xo = requests.get("http://www.paypal.com/xclick/business="+email, headers={"User-Agent":"Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0"}).text
        x = re.search("s.eVar36=\"(.*?)\";", xo)
        try:
            if x.group(1) != "":
                print("%s === %s" % (email, x.group(1)))
                with open(str(x.group(1))+".txt", 'a') as w:
                    w.write(email+"\n")
                return email
        except:
            pass
    except requests.exceptions.RequestException as e:
        print(e)

THREADS = 25
pool = multiprocessing.pool.ThreadPool(THREADS)
valid = [email for email in pool.map(base, emails, chunksize=1) if email]
print(valid)
pool.close()
Hi guys, I'm fairly new to Python. What I'm trying to do is move my old code to multiprocessing; however, I'm facing some errors that I hope someone can help me with. My code checks a few thousand links, given in a text file, for certain tags and outputs the links where they are found. Because I have a few thousand links to check, speed is an issue, hence the need to move to multiprocessing.
Update: I'm getting HTTP 503 errors back. Am I sending too many requests, or am I missing something?
Multiprocessing code:
from mechanize import Browser
from bs4 import BeautifulSoup
import sys
import socket
from multiprocessing.dummy import Pool  # This is a thread-based Pool
from multiprocessing import cpu_count

br = Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
no_stock = []

def main(lines):
    done = False
    tries = 1
    while tries and not done:
        try:
            r = br.open(lines, timeout=15)
            r = r.read()
            soup = BeautifulSoup(r, 'html.parser')
            done = True  # exit the loop
        except socket.timeout:
            print('Failed socket retrying')
            tries -= 1  # to exit when tries == 0
        except Exception as e:
            print '%s: %s' % (e.__class__.__name__, e)
            print sys.exc_info()[0]
            tries -= 1  # to exit when tries == 0
    if not done:
        print('Failed for {}\n'.format(lines))
    table = soup.find_all('div', {'class': "empty_result"})
    results = soup.find_all('strong', style='color: red;')
    if table or results:
        no_stock.append(lines)

if __name__ == "__main__":
    r = br.open('http://www.randomweb.com/')  # avoid redirection
    fileName = "url.txt"
    pool = Pool(processes=2)
    with open(fileName, "r+") as f:
        lines = pool.map(main, f)
    with open('no_stock.txt', 'w') as f:
        f.write('No. of out of stock items : ' + str(len(no_stock)) + '\n' + '\n')
        for i in no_stock:
            f.write(i + '\n')
Traceback:
Traceback (most recent call last):
  File "test2.py", line 43, in <module>
    lines = pool.map(main, f)
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 251, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 567, in get
    raise self._value
UnboundLocalError: local variable 'soup' referenced before assignment
My txt file looks something like this:
http://www.randomweb.com/item.htm?uuid=44733096229
http://www.randomweb.com/item.htm?uuid=4473309622789
http://www.randomweb.com/item.htm?uuid=447330962291
....etc
from mechanize import Browser
from bs4 import BeautifulSoup
import sys
import socket
from multiprocessing.dummy import Pool  # This is a thread-based Pool
from multiprocessing import cpu_count

br = Browser()
no_stock = []

def main(line):
    done = False
    tries = 3
    while tries and not done:
        try:
            r = br.open(line, timeout=15)
            r = r.read()
            soup = BeautifulSoup(r, 'html.parser')
            done = True  # exit the loop
        except socket.timeout:
            print('Failed socket retrying')
            tries -= 1  # to exit when tries == 0
        except:
            print('Random fail retrying')
            print sys.exc_info()[0]
            tries -= 1  # to exit when tries == 0
    if not done:
        print('Failed for {}\n'.format(line))
        return  # nothing was fetched, so skip parsing (avoids the UnboundLocalError)
    table = soup.find_all('div', {'class': "empty_result"})
    results = soup.find_all('strong', style='color: red;')
    if table or results:
        no_stock.append(line)

if __name__ == "__main__":
    fileName = "url.txt"
    pool = Pool(cpu_count() * 2)  # Creates a Pool with cpu_count * 2 threads.
    with open(fileName, "rb") as f:
        lines = pool.map(main, f)
    with open('no_stock.txt', 'w') as f:
        f.write('No. of out of stock items : ' + str(len(no_stock)) + '\n' + '\n')
        for i in no_stock:
            f.write(i + '\n')
pool.map takes two parameters: the first is a function (in your code, main), the other is an iterable; each item of the iterable is passed as a parameter to the function (in your code, each line of the file).
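To illustrate pool.map with a toy function and iterable (unrelated to the URLs in the question):
from multiprocessing.dummy import Pool  # thread-based Pool

def square(n):                        # the function: called once per item
    return n * n

pool = Pool(4)
print(pool.map(square, [1, 2, 3]))    # the iterable: prints [1, 4, 9]
pool.close()
pool.join()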