Selenium waits a long time while searching for elements - Python

I am working on a tool that finds email addresses in the source code of websites.
But sometimes the page sources are very long, so the search takes a long time.
How can I set a time limit for each site and have the script move on to the next URL once that time expires?
for query in my_list:
    results.append(search(query, tld="com", num=3, stop=3, pause=2))

for result in results:
    url = list(result)
    print(*url, sep='\n')
    for site in url:
        driver = webdriver.Chrome()
        driver.get(site)
        doc = driver.page_source
        # match email addresses in the page source
        emails = re.findall(r'[\w\.-]+@[\w\.-]+', doc)
        for email in emails:
            print(email)
results = []
start_time = time.time()

for query in my_list:
    results.append(search(query, tld="com", num=3, stop=3, pause=2))

for result in results:
    url = list(result)
    print(*url, sep='\n')
    for site in url:
        driver = webdriver.Chrome()
        driver.get(site)
        doc = driver.page_source
        emails = re.findall(r'[\w\.-]+@[\w\.-]+', doc)
        for email in emails:
            print(email)
        if time.time() - start_time > 10:
            # if 10 seconds have passed, move on
            start_time = time.time()
            time.sleep(3)
            driver.close()

import time

start_time = time.time()
# your code
while True:
    if time.time() - start_time > 2:
        # if 2 seconds have passed, do something
        start_time = time.time()
        print("2 seconds passed")

You can wait for a fixed amount of time using Python's time module, as follows:
import time

for site in url:
    driver = webdriver.Chrome()
    driver.get(site)
    time.sleep(8)
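
If the goal is to give each page a fixed budget and then move on to the next URL, Selenium's page-load timeout handles that directly. A minimal sketch, not taken from the answers above, reusing the url list and email regex from the question:

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
import re

driver = webdriver.Chrome()
driver.set_page_load_timeout(10)   # assumption: allow each page at most 10 seconds to load

for site in url:
    try:
        driver.get(site)           # raises TimeoutException if loading exceeds the limit
    except TimeoutException:
        continue                   # skip the slow site and move on to the next URL
    emails = re.findall(r'[\w\.-]+@[\w\.-]+', driver.page_source)
    for email in emails:
        print(email)

driver.quit()

This also avoids opening a new Chrome instance per site, which is itself a large part of the slowdown.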

Related

imap_tools Taking Long Time to Scrape Links from Emails

I am using imap_tools to get links from emails. The emails are very small, with very little text, graphics, etc. There are also not many of them, around 20-40 spread through the day.
When a new email arrives it takes between 10 and 25 seconds to scrape the link. This seems very long; I would have expected it to take less than 2 seconds, and speed is important.
N.B. it is a shared mailbox, so I cannot simply fetch unseen emails, because other users will often have opened emails before the scraper gets to them.
Can anyone see what the issue is?
import pandas as pd
from imap_tools import MailBox, AND
import re, time, datetime, os
from config import email, password

uids = []
yahooSmtpServer = "imap.mail.yahoo.com"

data = {
    'today': str(datetime.datetime.today()).split(' ')[0],
    'uids': []
}

while True:
    while True:
        try:
            client = MailBox(yahooSmtpServer).login(email, password, 'INBOX')
            try:
                if not data['today'] == str(datetime.datetime.today()).split(' ')[0]:
                    data['today'] = str(datetime.datetime.today()).split(' ')[0]
                    data['uids'] = []
                ds = str(datetime.datetime.today()).split(' ')[0].split('-')
                msgs = client.fetch(AND(date_gte=datetime.date.today()))
                for msg in msgs:
                    links = []
                    if str(datetime.datetime.today()).split(' ')[0] == str(msg.date).split(' ')[0] and not msg.uid in data['uids']:
                        mail = msg.html
                        if 'order' in mail and not 'cancel' in mail:
                            for i in re.findall(r'(https?://[^\s]+)', mail):
                                if 'pick' in i:
                                    link = i.replace('"', "")
                                    link = link.replace('<', '>').split('>')[0]
                                    print(link)
                                    links.append(link)
                                    break
                        data['uids'].append(msg.uid)
                    scr_links = pd.DataFrame({'Links': links})
                    scr_links.to_csv('Links.csv', mode='a', header=False, index=False)
                    time.sleep(0.5)
            except Exception as e:
                print(e)
                pass
            client.logout()
            time.sleep(5)
        except Exception as e:
            print(e)
            print('sleeping for 5 sec')
            time.sleep(1)
I think this is an email server throttle timeout.
Look into IMAP IDLE.
Since 0.51.0, imap_tools has IDLE support:
https://github.com/ikvk/imap_tools/releases/tag/v0.51.0
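
A minimal sketch of what an IDLE-based loop could look like (adapted, not from the original answer; it uses the idle manager documented for imap_tools 0.51.0+ and keeps fetching by date rather than by the seen flag, since the mailbox is shared):

import datetime
from imap_tools import MailBox, AND
from config import email, password

with MailBox("imap.mail.yahoo.com").login(email, password, 'INBOX') as mailbox:
    while True:
        # block until the server reports new activity, or give up after 60 seconds
        responses = mailbox.idle.wait(timeout=60)
        if responses:
            # something changed: fetch today's messages and filter on uid as in the original loop
            for msg in mailbox.fetch(AND(date_gte=datetime.date.today())):
                print(msg.uid, msg.subject)

This avoids re-logging-in and re-fetching on a fixed 5-second cycle, which is likely where most of the observed latency comes from.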

How can I improve the aiohttp crawler speed?

import asyncio
import time

import aiohttp
from bs4 import BeautifulSoup
from xlrd import open_workbook
from xlwt import Workbook

url_list = ["https://www.facebook.com", "https://www.baidu.com", "https://www.yahoo.com", ...]
# There are more than 20000 different websites in the list
# Some websites may not be accessible
keywords = ['xxx', 'xxx', ...]

start = time.time()
localtime = time.asctime(time.localtime(time.time()))
print("start time :", localtime)

choose_url = []
url_title = []

async def get(url, session):
    try:
        async with session.get(url=url, timeout=0) as response:
            resp = await response.text()
            soup = BeautifulSoup(resp, "lxml")
            title = soup.find("title").text.strip()
            for keyword in keywords:
                if keyword in title:
                    choose_url.append(url)
                    url_title.append(title)
                    print("Successfully got url {} with resp's name {}.".format(url, title))
                    break
    except Exception as e:
        pass

async def main(urls):
    connector = aiohttp.TCPConnector(ssl=False, limit=0, limit_per_host=0)
    session = aiohttp.ClientSession(connector=connector)
    ret = await asyncio.gather(*[get(url, session) for url in urls])
    print("Finalized all. Return is a list of outputs.")
    await session.close()

def write_exccel(choose_url, url_title):
    # write choose_url, url_title to excel
    pass

asyncio.run(main(url_list))
write_exccel(choose_url, url_title)

localtime = time.asctime(time.localtime(time.time()))
print("now time is :", localtime)
end = time.time()
print('time used:', end - start)
I have 20000 URLs to request, but it takes a long time (more than 4 or 5 hours). It takes just 3 hours if I use requests + multiprocessing (Pool of 4).
I tried to use aiohttp + multiprocessing, but it doesn't seem to work. Can the code be made as fast as possible, either by optimizing this code or by using any available technology? Thanks.
I don't know if the following method is fast or not.

import time
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils

class MySpider(Spider):
    name = 'demo_spider'
    start_urls = ["https://www.facebook.com", "https://www.baidu.com", "https://www.yahoo.com"]  # Entry page
    keywords = ['xxx', 'xxx']
    choose_url = []
    url_title = []
    concurrencyPer1s = 10

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        title = doc.title
        if title.containsOr(self.keywords):
            self.choose_url.append(url.url)
            self.url_title.append(title.text)
            print("Successfully got url {} with resp's name {}.".format(url, title.text))

    def urlCount(self):
        count = Spider.urlCount(self)
        if count == 0:
            SimplifiedMain.setRunFlag(False)
        return count

start = time.time()
localtime = time.asctime(time.localtime(time.time()))
print("start time :", localtime)

SimplifiedMain.startThread(MySpider(), {"concurrency": 600, "concurrencyPer1S": 100, "intervalTime": 0.001, "max_workers": 10})  # Start download

localtime = time.asctime(time.localtime(time.time()))
print("now time is :", localtime)
end = time.time()
print('time used:', end - start)
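
A separate direction, not from the answer above: keep aiohttp but bound the concurrency and give each request a real timeout (timeout=0 in the question disables the timeout entirely, so one stuck host can hold a connection open indefinitely). A minimal sketch under those assumptions, reusing url_list, keywords, choose_url and url_title from the question:

import asyncio
import aiohttp
from bs4 import BeautifulSoup

CONCURRENCY = 200  # assumption: tune to your bandwidth and CPU

async def get(url, session, sem):
    try:
        async with sem:                      # at most CONCURRENCY requests in flight
            async with session.get(url) as response:
                resp = await response.text()
        title = BeautifulSoup(resp, "lxml").find("title")
        if title and any(k in title.text for k in keywords):
            choose_url.append(url)
            url_title.append(title.text.strip())
    except Exception:
        pass                                 # unreachable sites are simply skipped

async def main(urls):
    sem = asyncio.Semaphore(CONCURRENCY)
    timeout = aiohttp.ClientTimeout(total=15)   # per-request budget instead of timeout=0
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        await asyncio.gather(*(get(u, session, sem) for u in urls))

asyncio.run(main(url_list))

If parsing 20000 pages with BeautifulSoup turns out to dominate, extracting the title with a regex or handing the parsing to a process pool is the usual next step.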

Python multiprocessing in for loop (requests and BeautifulSoup)

I have a list with a lot of links and I want to use multiprocessing to speed up the process. Here is a simplified version; I need the output to be ordered like this:
I tried a lot of things: Process, Pool, etc. I always got errors. I need to do it with 4 or 8 threads and keep the output ordered as shown. Thank you for any help. Here is the code:
from bs4 import BeautifulSoup
import requests
import time

links = ["http://www.tennisexplorer.com/match-detail/?id=1672704",
         "http://www.tennisexplorer.com/match-detail/?id=1699387",
         "http://www.tennisexplorer.com/match-detail/?id=1698990",
         "http://www.tennisexplorer.com/match-detail/?id=1696623",
         "http://www.tennisexplorer.com/match-detail/?id=1688719",
         "http://www.tennisexplorer.com/match-detail/?id=1686305"]
data = []

def essa(match, omega):
    aaa = BeautifulSoup(requests.get(match).text, "lxml")
    center = aaa.find("div", id="center")
    p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
    p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
    return p1_l + " - " + p2_l + " - " + str(omega)

i = 1
start_time = time.clock()
for link in links:
    data.append(essa(link, i))
    i += 1

for d in data:
    print(d)

print(time.clock() - start_time, "seconds")
Spawn several threads of the function and join them together:
from threading import Thread

def essa(match, omega):
    aaa = BeautifulSoup(requests.get(match).text, "lxml")
    center = aaa.find("div", id="center")
    p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
    p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
    print(p1_l + " - " + p2_l + " - " + str(omega))

if __name__ == '__main__':
    threadlist = []
    for index, url in enumerate(links):
        t = Thread(target=essa, args=(url, index))
        t.start()
        threadlist.append(t)
    for b in threadlist:
        b.join()
You won't get them to print in order, for the simple reason that some HTTP responses take longer than others.
As far as I can understand, you have a list of links and want to make the requests concurrently to speed up the process. Here is sample code for multithreading; I hope it helps. Read the documentation for concurrent.futures.
import concurrent.futures
import urllib.request

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
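
If keeping the output in the same order as the input matters (as in the original question), ThreadPoolExecutor.map preserves input order even though the requests run concurrently. A short sketch, not from the answers above, reusing the essa function and links list from the question:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=8) as executor:
    # map() yields results in the same order as `links`, regardless of completion order
    for line in executor.map(essa, links, range(1, len(links) + 1)):
        print(line)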

Python: parallel loop

I have a dataframe with a column:
event_time
avito.ru/morozovsk/avtomobili/honda_accord_1998_799656153
avito.ru/donetck/avtomobili/honda_accord_2000_829068734
avito.ru/taganrog/avtomobili/volkswagen_passat_1997_839237476
avito.ru/volgodonsk/avtomobili/volkswagen_golf_1993_657720225
avito.ru/taganrog/avtomobili/peugeot_206_2008_818743294
avito.ru/bataysk/avtomobili/peugeot_206_2002_825498743
and I need to open each HTML page. I use a proxy and the following code:
for url in urls:
    m = re.search(r'avito.ru\/[a-z]+\/avtomobili\/[a-z0-9_]+$', url)
    if m is not None:
        url = 'https://www.' + url
        proxy = pd.read_excel('proxies.xlsx')
        proxies = proxy.proxy.values.tolist()
        for i, proxy in enumerate(proxies):
            # print "Trying HTTP proxy %s" % proxy
            try:
                result = urllib.urlopen(url, proxies={'http': proxy}).read()
                # block-page text: "We have detected that requests coming from your IP address look automated"
                if 'Мы обнаружили, что запросы, поступающие с вашего IP-адреса, похожи на автоматические' in result:
                    raise Exception
                else:
                    soup = BeautifulSoup(result, 'html.parser')
                    price = soup.find('span', itemprop="price")
                    print price
            except:
                print "Trying next proxy %s in 30 seconds" % proxy
                time.sleep(30)
But it takes a lot of time! I want to do it faster.
I tried to move this code into a function:
def get_page(url):
    ...
and then use:
if __name__ == '__main__':
    pool = Pool(processes=8)
    pool.map(get_page, urls)
I want to open each URL with a proxy, but it works incorrectly for me.
Is there any way to solve my task?
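
A rough sketch of how the pool.map attempt could be wired up (hypothetical, not from the thread; it uses a thread pool because the work is network-bound, and bundles each URL with a proxy since pool.map passes a single argument):

from multiprocessing.dummy import Pool  # thread pool with the same API as multiprocessing.Pool
import itertools

def get_page(args):
    url, proxy = args
    # ... fetch `url` through `proxy` and parse the price, as in the loop above ...
    return url

if __name__ == '__main__':
    pairs = list(zip(urls, itertools.cycle(proxies)))  # assumption: rotate proxies across URLs
    pool = Pool(processes=8)
    results = pool.map(get_page, pairs)

The 30-second sleep on every failed proxy is likely what dominates the runtime; with a pool it at least blocks only one worker instead of the whole loop.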

No performance gain with python threading

I'm writing a parallel crawler using Python and I'm storing some information in MongoDB. After testing I realized that my code, even though it uses threading, is not parallel. It makes no difference whether I use a single thread, or 10, or 50 threads. I can't figure out why.
EDIT: From what I can see, most of the processing time is taken up by soup = BeautifulSoup(html). Could it be that this call can't be parallelized using threads?
from threading import Thread
import Queue
import urllib2
import re
from BeautifulSoup import *
from urlparse import urljoin
from pymongo import MongoClient
from urlparse import urlparse
import time
import hashlib

start_time = time.time()
level = 1

client = MongoClient()
db = client.crawler

visited = {}

def doWork():
    while True:
        try:
            myUrl = q_start.get()
        except:
            continue
        try:
            c = urllib2.urlopen(myUrl)
        except:
            q_start.task_done()
            continue
        parsed_url = urlparse(myUrl)
        html = c.read()
        try:
            soup = BeautifulSoup(html)
        except:
            q_start.task_done()
            continue
        txt = soup.prettify()
        links = soup('a')
        m = hashlib.md5(myUrl)
        db.urls.insert(
            {
                "url": myUrl,
                "HTML": txt,
                "level": level,
                "domain": parsed_url.netloc,
                "md5": m.hexdigest()
            }
        )
        for link in links:
            if 'href' in dict(link.attrs):
                url = urljoin(myUrl, link['href'])
                if url.find("'") != -1:
                    continue
                url = url.split('#')[0]
                if url[0:4] == 'http':
                    if url in visited:
                        continue
                    else:
                        visited[url] = True
                        q_new.put(url)
        q_start.task_done()

q_start = Queue.Queue()
q_new = Queue.Queue()

for i in range(50):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

q_start.put("http://google.com")
q_start.join()

for i in range(2, 5):
    print "Depth: "
    print i
    print time.time() - start_time
    level += 1
    print q_new.qsize()
    q_aux = q_new
    q_new = Queue.Queue()
    while q_aux.empty() != True:
        x = q_aux.get()
        q_start.put(x)
    q_start.join()

print "end"
print time.time() - start_time
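
As the edit suspects, BeautifulSoup parsing is pure-Python CPU work, so the GIL prevents it from running in parallel across threads; the threads only help with the network I/O. One common workaround is to hand the parsing to worker processes. A small, self-contained sketch of the idea (Python 3 / bs4 style, hypothetical and not from the original post):

from concurrent.futures import ProcessPoolExecutor
from bs4 import BeautifulSoup

def parse(html):
    # CPU-bound parsing runs in a separate process, so it is not serialized by the GIL
    soup = BeautifulSoup(html, "html.parser")
    return [a.get('href') for a in soup('a')]

if __name__ == '__main__':
    pages = ["<html><body><a href='http://example.com/a'>a</a></body></html>",
             "<html><body><a href='http://example.com/b'>b</a></body></html>"]
    with ProcessPoolExecutor(max_workers=4) as pool:
        for links in pool.map(parse, pages):
            print(links)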
