Accelerate 2 loops with regex to find email address on website - python

I need help finding email addresses on websites. After some research I found a solution, but it is very slow: I have a lot of data (more than 90,000 URLs) and my code never finishes.
Do you know any tips to optimize/accelerate my code?
This is my list of the URL:
http://etsgaidonsarl.site-solocal.com/
http://fr-fr.facebook.com/people/
http://ipm-mondia.com/
http://lfgenieclimatique.fr/
http://vpcinstallation.site-solocal.com
http://www.cavifroid.fr/
http://www.clim-monnier.com/
http://www.climacool.net/
I use 2 loops. The first finds all the pages of a website, because the email address is not always on the first page.
In the second loop, I crawl each page to find the email address. The code:
# RFC-5322-style pattern for an email address: local part, "@", then either a
# dotted domain name or a bracketed IP literal.  The separator must be "@";
# with any other character the pattern can never match a real address.
EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?\.)+[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-zA-Z]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
I think my regex is too long; can that be a problem?
# Crawl every start URL, follow each link found on its first page, and collect
# every email address found on those linked pages.
session = HTMLSession()
mailing = []
# Compile once: the same pattern is applied to every downloaded page.
email_pattern = re.compile(EMAIL_REGEX)
# Without a timeout a single unresponsive server blocks the whole run forever.
REQUEST_TIMEOUT = 10
for index, i in enumerate(link):  # link is the list of the URLs
    try:
        r = session.get(i, timeout=REQUEST_TIMEOUT)
        linkslist = list(r.html.absolute_links)
    except Exception:
        # Fall back to scanning the start page itself.  list(i) would split
        # the URL string into single characters, so wrap it in a list instead.
        linkslist = [i]
    for j in linkslist:
        try:
            r1 = session.get(j, timeout=REQUEST_TIMEOUT)
            page_text = r1.html.raw_html.decode()
        except Exception:
            # Best effort: an unreachable or undecodable page is skipped.
            continue
        for re_match in email_pattern.finditer(page_text):
            # Row layout: [position of the start URL in link, address, page URL]
            mailing.append([index, re_match.group(), j])
print(mailing)
df = pd.DataFrame(mailing, columns=['index1', 'mail', 'lien'])
Thanks for your help

I think multi-threading should do the job. I don't know exactly what your regex does, but assuming it works and is helpful, the multi-threaded version should look like the following. I tested the code and it works.
`from threading import Thread, Lock
from requests_html import HTMLSession
import re
# Held by main() while it takes an entry off linklist.
lock = Lock()
# Start pages; a result row refers to a site by its position in this list.
link = ["http://etsgaidonsarl.site-solocal.com/",
"http://fr-fr.facebook.com/people/",
"http://ipm-mondia.com/",
"http://lfgenieclimatique.fr/",
"http://vpcinstallation.site-solocal.com",
"http://www.cavifroid.fr/",
"http://www.clim-monnier.com/",
"http://www.climacool.net/"]
# Work queue of (start URL, page URL) pairs: filled by links_scraper,
# consumed by main().
linklist = []
# Result rows appended by mail_scrapper: [index in link, address, page URL].
mailing = []
# Threads running links_scraper (one per start URL).
main_threads = []
# Threads running mail_scrapper (one per queued page).
minor_threads = []
# RFC-5322-style pattern for an email address: local part, "@", then either a
# dotted domain name or a bracketed IP literal.  The separator must be "@";
# with any other character the pattern can never match a real address.
EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?\.)+[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-zA-Z]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
def links_scraper(single_url):
    """Queue every absolute link found on ``single_url`` for email scanning.

    Each link is appended to the module-level ``linklist`` as a
    ``(single_url, link)`` pair.  If the page cannot be fetched or parsed,
    the start URL itself is queued so that it is still scanned.
    """
    try:
        # The session is only needed for this one request; closing it
        # releases its pooled connections instead of leaking them per thread.
        with HTMLSession() as session:
            r = session.get(single_url)
            the_list = list(r.html.absolute_links)
        # Build the pairs first so the shared queue grows in a single call.
        linklist.extend([(single_url, found) for found in the_list])
    except Exception:
        # Best effort: fall back to scanning the start page alone.
        linklist.append((single_url, single_url))
def mail_scrapper(main_url, single_link):
    """Download ``single_link`` and record every email address found in it.

    Each match is appended to the module-level ``mailing`` list as
    ``[index of main_url in link, address, single_link]``.  Any failure
    (network error, undecodable page, unknown ``main_url``) leaves
    ``mailing`` untouched for this page.
    """
    try:
        # Close the session once the page is downloaded so that the
        # per-thread connection pool is released.
        with HTMLSession() as session:
            r1 = session.get(single_link)
            page_text = r1.html.raw_html.decode()
        # The site index is the same for every match on this page.
        site_index = link.index(main_url)
        rows = [[site_index, re_match.group(), single_link]
                for re_match in re.finditer(EMAIL_REGEX, page_text)]
        mailing.extend(rows)
    except Exception:
        # Best effort by design: one bad page must not stop the crawl.
        pass
def main():
    """Run the crawl: one link-collecting thread per start URL, then one
    email-scanning thread per collected link; returns once all have finished.

    NOTE(review): one thread is still started per queued link, so a very
    large crawl creates a very large number of threads.
    """
    for start_url in link:
        t = Thread(target=links_scraper, args=(start_url,))
        t.start()
        main_threads.append(t)
    # Keep going while a producer is running OR links are still queued.
    # Stopping as soon as the producers finish would drop whatever they
    # queued last.
    while main_threads or linklist:
        # Prune finished producers BEFORE draining the queue: everything a
        # finished producer queued is then guaranteed to be picked up below.
        # Rebuilding in place avoids removing items while iterating, and
        # is_alive() replaces isAlive(), which was removed in Python 3.9.
        main_threads[:] = [t for t in main_threads if t.is_alive()]
        with lock:
            pending = linklist[:]
            # Producers only append at the end, so the snapshot is a prefix.
            del linklist[:len(pending)]
        for main_url, single_link in pending:
            minor_thread = Thread(target=mail_scrapper, args=(main_url, single_link))
            minor_threads.append(minor_thread)
            minor_thread.start()
        if main_threads and not pending:
            # Nothing to hand out yet: wait briefly on a producer instead of
            # spinning at full CPU.
            main_threads[0].join(timeout=0.05)
    for t in minor_threads:
        t.join()
# Run the crawl, then print every row collected in mailing.
main()
print("Mailing:", mailing)`

Related

Can't Stop ThreadPoolExecutor

I'm scraping hundreds of urls, each with a leaderboard of data I want, and the only difference between each url string is a 'platform','region', and lastly, the page number. There are only a few platforms and regions, but the page numbers change each day and I don't know how many there are. So that's the first function, I'm just creating lists of urls to be requested in parallel.
If I use page=1, then the result will contain 'table_rows > 0' in the last function. But around page=500, the requested url still pings back but very slowly and then it will show an error message, no leaderboard found, the last function will show 'table_rows == 0', etc. The problem is I need to get through the very last page and I want to do this quickly, hence the threadpoolexecutor - but I can't cancel all the threads or processes or whatever once PAGE_LIMIT is tripped. I threw the executor.shutdown(cancel_futures=True) just to kind of show what I'm looking for. If nobody can help me I'll miserably remove the parallelization and I'll scrape slowly, sadly, one url at a time...
Thanks
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import pandas
import requests
# Platform and country codes combined into the leaderboard URLs.
PLATFORM = ['xbl', 'psn', 'atvi', 'battlenet']
REGION = ['us', 'ca']
# Stays True until a scraped page contains no table rows.
PAGE_LIMIT = True
# Connectivity flag read by leaderboardExecutor; it has to exist before the
# first read, otherwise that read raises NameError.
INTERNET = True
def leaderboardLister():
    """Build one leaderboard URL per region, platform and page number
    (pages 1 to 749) and hand the full list to leaderboardExecutor with
    30 worker threads."""
    base = 'https://cod.tracker.gg/warzone/leaderboards/battle-royale/'
    # Same ordering as nested loops: region outermost, page innermost.
    list_url = [
        base + platform + '/KdRatio?country=' + region + '&page=' + str(page)
        for region in REGION
        for platform in PLATFORM
        for page in range(1, 750)
    ]
    leaderboardExecutor(list_url, 30)
def leaderboardExecutor(urls,threads):
    """Scrape ``urls`` with ``threads`` workers and stop early once a worker
    reports an empty page.

    Every URL is submitted exactly once.  Results are awaited in submission
    order; as soon as ``PAGE_LIMIT`` has been set to False, every job that
    has not started yet is cancelled.  Jobs already running finish when the
    ``with`` block exits.

    NOTE(review): PAGE_LIMIT is a single global flag, so an empty page for
    one platform/region stops the others too.
    """
    global PAGE_LIMIT
    global INTERNET
    import time  # local import: the connectivity back-off needs time.sleep

    if len(urls) == 0:
        return
    with ThreadPoolExecutor(max_workers=threads) as executor:
        # submit() returns Future objects, which can be cancelled one by one;
        # executor.map() gives no handle for stopping early.
        futures = [executor.submit(scrapeLeaderboardPage, url) for url in urls]
        for future in futures:
            # exception() waits for completion without re-raising, so one
            # failed page cannot abort the whole run.
            error = future.exception()
            if error is not None:
                print('failed', error)
            if PAGE_LIMIT == False:
                # cancel() is a harmless no-op on running or finished jobs.
                for pending in futures:
                    pending.cancel()
                break
            # INTERNET may not have been initialised; treat that as "online".
            while globals().get('INTERNET', True) == False:
                try:
                    print('bad internet')
                    requests.get("http://google.com")
                    INTERNET = True
                except Exception:
                    time.sleep(3)
                    print('waited')
def scrapeLeaderboardPage(url):
    """Fetch one leaderboard page and update the shared flags.

    Sets ``PAGE_LIMIT`` to False when the page contains no ``<tr>`` rows and
    sets ``INTERNET`` to False when the request itself fails.
    """
    global PAGE_LIMIT
    # Without this declaration the assignment below creates a local variable
    # and the module-level flag is never updated.
    global INTERNET
    checkInternet()
    try:
        # Pages past the end answer very slowly; bound the wait.
        page = requests.get(url, timeout=30)
    except requests.exceptions.RequestException:
        # Only a failed request says anything about connectivity.
        INTERNET = False
        return
    soup = BeautifulSoup(page.content,features = 'lxml')
    table_rows = soup.find_all('tr')
    if len(table_rows) == 0:
        PAGE_LIMIT = False
        print(url)
    else:
        print('success')
leaderboardLister()

Python check if website exists for a list of websites

I want to check if a website exists, given a list of websites in the format XXXXX.com, where XXXXX=a 5 digit number. So I want to go through from 00000 up to 99999 and see if those variants of the website exist.
I want to do something like
import requests
# requests.get raises (instead of returning a status) when the host cannot be
# resolved or reached, which is the usual case for a site that does not
# exist, so that has to be handled as "does not exist" too.
try:
    request = requests.get('http://www.example.com', timeout=10)
except requests.exceptions.RequestException:
    request = None
if request is not None and request.status_code == 200:
    print('Web site exists')
else:
    print('Web site does not exist')
But generate a list of some sort (or even just export a list to csv), so for each URL, i know if it exists or not.
Any advice would be great!
I'm going to make an assumption that you have a large list of URLs and you want to read them in from some source file, let's say a text file, rather than hard-coding a large list of URLs in Python file, right. If that's the case, run the script below and you'll get what you want.
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
start = time.time()
# One URL per line.  Strip the line terminators (readlines() keeps them, and
# a URL ending in "\n" is rejected by urlopen) and skip blank lines; the
# with-block closes the file once it has been read.
with open('C:\\your_path\\check_me.txt', 'r', encoding="ISO-8859-1") as file:
    urls = [line.strip() for line in file if line.strip()]
print(urls)
def checkurl(url):
    """Try to open ``url`` and print whether it answered.

    Prints ``good``, ``HTTPError: <code>``, ``URLError: <reason>`` or
    ``ValueError: <message>`` followed by the URL.  Returns None.
    """
    # Lines read from a file still carry their trailing newline.
    url = url.strip()
    try:
        conn = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print('HTTPError: {}'.format(e.code) + ', ' + url)
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print('URLError: {}'.format(e.reason) + ', ' + url)
    except ValueError as e:
        # Malformed entry (empty line, missing scheme, ...): report it
        # instead of letting one bad line abort the whole pool.map call.
        print('ValueError: {}'.format(e) + ', ' + url)
    else:
        # 200 -- release the connection instead of leaking it.
        conn.close()
        print('good' + ', ' + url)
if __name__ == "__main__":
    # The with-block releases the 20 worker processes once map() has
    # returned, instead of leaving them to interpreter shutdown.
    with Pool(processes=20) as p:
        result = p.map(checkurl, urls)
    print("done in : ", time.time()-start)
Try combining xrange and the string zfill method in a loop.
import requests
def test_for_200(url):
    """Return True if ``url`` answers with HTTP status 200, else False.

    A host that cannot be resolved or reached counts as False rather than
    raising, so that scanning a long list is not aborted by the first
    non-existent site.
    """
    try:
        req = requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        return False
    return req.status_code == 200
def numbers():
    """Yield every 5-digit, zero-padded string from '00000' to '99999'."""
    # range works on Python 2 and 3; xrange does not exist on Python 3.
    for n in range(100000):
        yield str(n).zfill(5)
# Maps each 5-digit string to the result of test_for_200 for
# "http://<digits>.com".
results = {}
for num in numbers():
url = "http://{}.com".format(num)
# One blocking request per candidate; the whole scan runs sequentially.
results[num] = test_for_200(url)
results will look something like this:
>>> results
{'00000': True, '00001': False, ...}

how to speed up my process

I wrote a script that will web scrape data for a list of stocks. The scraper has to get the data from 2 separate pages so each stock symbol must scrape 2 different pages. If I run the process on a list that is 1000 items long it will take around 30 minutes to complete. It's not horrible, I can set it and forget it, but I'm wondering if there is a way to speed up the process. Maybe store the data and wait to write it all at the end instead of on each loop? Any other ideas appreciated.
import requests
from BeautifulSoup import BeautifulSoup
from progressbar import ProgressBar
import csv
# Ticker symbols to look up.  This is a set literal, so iteration order is
# not guaranteed to match the order written here.
symbols = {'AMBTQ','AABA','AAOI','AAPL','AAWC','ABEC','ABQQ','ACFN','ACIA','ACIW','ACLS'}
pbar = ProgressBar()
# Opened in append mode: rows are added to any existing file, and the header
# row below is written again on every run.
with open('industrials.csv', "ab") as csv_file:
writer = csv.writer(csv_file, delimiter=',')
writer.writerow(['Symbol','5 Yr EPS','EPS TTM'])
# Two sequential HTTP requests per symbol; the CSV row is written only after both.
for s in pbar(symbols):
# Page 1: fundamentals page, read into EPS5yr.
try:
url1 = 'https://research.tdameritrade.com/grid/public/research/stocks/fundamentals?symbol='
full1 = url1 + s
response1 = requests.get(full1)
html1 = response1.content
soup1 = BeautifulSoup(html1)
# NOTE(review): this iterates over the result of find(), not over several
# matching divs; if find() returns None the resulting exception is caught below.
for hist_div in soup1.find("div", {"data-module-name": "HistoricGrowthAndShareDetailModule"}):
EPS5yr = hist_div.find('label').text
except Exception as e:
# Any failure (network, missing element, parse) is recorded as a placeholder.
EPS5yr = 'Bad Data'
pass
# Page 2: summary page, read into EPSttm.
try:
url2 = 'https://research.tdameritrade.com/grid/public/research/stocks/summary?symbol='
full2 = url2 + s
response2 = requests.get(full2)
html2 = response2.content
soup2 = BeautifulSoup(html2)
for div in soup2.find("div", {"data-module-name": "StockSummaryModule"}):
# Takes the text of the 12th <dd> element (index 11).
EPSttm = div.findAll("dd")[11].text
except Exception as e:
EPSttm = "Bad data"
pass
# NOTE(review): if a loop above runs zero times without raising, the value
# from the previous symbol is written here -- confirm that cannot happen.
writer.writerow([s,EPS5yr,EPSttm])

Python: requests hang for hours

I am using requests to resolve URLs for about 410K pieces of check-in data. However, the process hangs somewhere for hours and I am not sure where the problem is. I did the same thing for 1.7M pieces of data before and it worked well. Here is my code:
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
pat = re.compile(r"(?P<url>https?://[^\s]+)") # always compile it
def resolve_url(text, timeout=10):
    """Find the first http(s) URL in ``text`` and follow its redirects.

    Args:
        text: Free text that may contain a URL.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout a single unresponsive host blocks its worker forever.

    Returns:
        A tuple ``(url, long_url, error)``.  ``url`` is ``'before'`` when no
        URL was found, ``long_url`` is ``'after'`` when nothing was resolved,
        and ``error`` is ``'none'`` or the exception raised by the request.
    """
    url = 'before'
    long_url = 'after'
    error = 'none'
    match = pat.search(text)
    if match:
        url = match.group("url")
        try:
            long_url = requests.head(url, allow_redirects=True, timeout=timeout).url
        except requests.exceptions.RequestException as e:
            # Timeouts are RequestException subclasses, so they land here too.
            error = e
    return (url, long_url, error)
# Resolve every text in parallel and checkpoint the results to disk whenever
# the index reaches a multiple of 10000.
pool = multiprocessing.Pool(200)
resolved_urls = []
try:
    for i, res in enumerate(pool.imap(resolve_url, text_with_url)):
        resolved_urls.append(res)
        if i%10000 == 0 and i > 0:
            print("%d elements have been processed, %2.5f seconds" %(i+1, time.time()-t0))
            # pickle writes bytes, so the file must be opened in binary
            # mode; the with-block closes it even if dump() fails.
            with open("./yangj/resolved_urls_%d_requests.pkl"%(i+1),"wb") as fout:
                pickle.dump(resolved_urls, fout)
            # Start a fresh batch so each checkpoint holds only new results.
            resolved_urls = []
finally:
    # Release the 200 worker processes whether or not the loop completed.
    pool.terminate()
    pool.join()
# Whatever is left after the last checkpoint.
with open("./yangj/resolved_urls_last_requests.pkl","wb") as fout:
    pickle.dump(resolved_urls, fout)
I was wondering whether the problem is because of some exception that I need to write code to recover. I have looked through requests documents and previous similar questions but I didn't find matching answers. Any idea to solve the problem?

Fetching the first image from a website that belongs to the post

I've written a program that fetches the desired information from a blog or any page. The next thing, I want to achieve is to retrieve the first image from that page, that belongs to the respective post (Just like Facebook does when a post is shared).
I was able to achieve this to some extent by fetching the first image with an alt tag (since many websites don't have alt tags in their logos and icons etc, the first one should belong to the post). But this does not seem to work in some cases. Is there any other (better) way to achieve this?
I'm using python 2.7.9 and BeautifulSoup 4.
# Print the main fields of every feed entry, then the first .jpg/.png image
# on the linked page that carries an alt attribute.
# print(x) with a single argument and "except ... as e" behave the same on
# Python 2.7 and Python 3.
d = feedparser.parse('http://rss.cnn.com/rss/edition.rss')
for entry in d.entries:
    # One reader per field, evaluated lazily, so a missing field only skips
    # itself and reports its own error.
    field_readers = (
        lambda: entry.title,
        lambda: entry.link,
        lambda: entry.published[5:16],
        lambda: entry.category,
        lambda: entry.get('summary', ''),
    )
    for read_field in field_readers:
        try:
            value = read_field()
            if value is not None:
                print(value)
                print("")
        except Exception as e:
            print(e)
    # Pause between page downloads.
    time.sleep(5)
    r = requests.get(entry.link, headers = {'User-Agent' : 'Safari/534.55.3 '})
    soup = BeautifulSoup(r.text, 'html.parser')
    for img in soup.findAll('img'):
        # An <img> without src would raise KeyError with img['src'].
        src = img.get('src', '')
        if img.has_attr('alt') and src.endswith(('.jpg', '.png')):
            print(src)
            break
It is probably more practical to take a look at the opengraph module:
https://pypi.python.org/pypi/opengraph/0.5
and correct it the way you like.
It will fetch "first image" from HTML code or use og:image.
If you want to learn, you can also do it by looking at the source code. The module uses BeautifulSoup too.
I needed the following monkeypatch to activate scraping as fallback:
import re
from bs4 import BeautifulSoup
from opengraph import OpenGraph
def parser(self, html):
    """Fill this object from the ``og:*`` properties found in ``html``.

    ``html`` is either markup or an already-built BeautifulSoup document.
    Each element in the document head whose ``property`` starts with ``og``
    is stored under the property name minus its first three characters.
    If the object is then still not valid and ``self.scrape`` is truthy,
    each missing required attribute is filled by the matching
    ``scrape_<attr>`` method when one exists.
    """
    doc = html if isinstance(html, BeautifulSoup) else BeautifulSoup(html, from_encoding='utf-8')
    og_pattern = re.compile(r'^og')
    for og in doc.html.head.findAll(property=og_pattern):
        key = og[u'property'][3:]
        self[key] = og[u'content']
    # Nothing more to do when the og tags sufficed or scraping is disabled.
    if self.is_valid() or not self.scrape:
        return
    # Couldn't fetch all attrs from og tags, try scraping body
    for attr in self.required_attrs:
        if hasattr(self, attr):
            continue
        try:
            scraper = getattr(self, 'scrape_%s' % attr)
            self[attr] = scraper(doc)
        except AttributeError:
            pass
OpenGraph.parser = parser
OpenGraph.scrape = True # workaround for some subtle bug in opengraph
You may need to handle relative URLs in the image sources, but that is quite straightforward using urljoin from urlparse
import opengraph
...
page = opengraph.OpenGraph(url=link, scrape=True)
...
if page.is_valid():
...
image_url = page.get('image', None)
...
if not image_url.startswith('http'):
image_url = urljoin(page['_url'], page['image'])
(some checks are omitted from the code fragment for brevity)

Categories

Resources