Improve Regex to catch complete emails from Google search? - python

In order to practice and to help my sister get emails from doctors for her baby, I have designed this email harvester. It makes a search, cleans the URLs given, adds them to a dictionary, and parses them for emails in two different ways.
The code has been taken from different places, so if you correct me, please explain your improvement clearly, as I am already working at the limit of my knowledge.
The question is how to capture emails better (and improve the code, if possible). I'll post the code and the exact output below:
CODE of my program:
import requests, re, webbrowser, bs4
from selenium import webdriver
from bs4 import BeautifulSoup
import time, random, webbrowser
import urllib.request

def google_this(): #Googles and gets the first few links
    search_terms = ['Fiat','Lambrusco']
    added_terms = 'email contact? @'
    #This searches for certain keywords in Google and parses results with BS
    for el in search_terms:
        webpage = 'http://google.com/search?q=' + str(el) + str(added_terms)
        print('Searching for the terms...', el, added_terms)
        headers = {'User-agent':'Mozilla/5.0'}
        res = requests.get(webpage, headers=headers)
        #res.raise_for_status()
        statusCode = res.status_code
        if statusCode == 200:
            soup = bs4.BeautifulSoup(res.text,'lxml')
            serp_res_rawlink = soup.select('.r a')
            dicti = [] #This gets the href links
            for link in serp_res_rawlink:
                url = link.get('href')
                if 'pdf' not in url:
                    dicti.append(url)
            dicti_url = [] #This cleans the "url?q=" from link
            for el in dicti:
                if '/url?q=' in el:
                    result = (el.strip('/url?q='))
                    dicti_url.append(result)
            #print(dicti_url)
            dicti_pretty_links = [] #This cleans the gibberish at end of url
            for el in dicti_url[0:4]:
                pretty_url = el.partition('&')[0]
                dicti_pretty_links.append(pretty_url)
            print(dicti_pretty_links)
            for el in dicti_pretty_links: #This converts page in BS soup
                # browser = webdriver.Firefox()
                # browser.get(el)
                # print('I have been in the element below and closed the window')
                # print(el)
                # time.sleep(1)
                # browser.close()
                webpage = (el)
                headers = {'User-agent':'Mozilla/5.0'}
                res = requests.get(webpage, headers=headers)
                #res.raise_for_status()
                statusCode = res.status_code
                if statusCode == 200:
                    soup = bs4.BeautifulSoup(res.text,'lxml')
                    #This is the first way to search for an email in soup
                    emailRegex = re.compile(r'([a-zA-Z0-9_.+]+@+[a-zA-Z0-9_.+])', re.VERBOSE)
                    mo = emailRegex.findall(res.text)
                    #mo = emailRegex.findall(soup.prettify())
                    print('THIS BELOW IS REGEX')
                    print(mo)
                    #This is the second way to search for an email in soup:
                    mailtos = soup.select('a[href^=mailto]')
                    for el in mailtos:
                        print('THIS BELOW IS MAILTOS')
                        print(el.text)
                    time.sleep(random.uniform(0.5,1))

google_this()
And here is the OUTPUT of this very same code. As you can see, some emails seem to be found, but they are cut just after the "@" symbol:
C:\Users\SK\AppData\Local\Programs\Python\Python35-32\python.exe C:/Users/SK/PycharmProjects/untitled/another_temperase.py
Searching for the terms... Fiat email contact? @
['http://www.fcagroup.com/en-US/footer/Pages/contacts.aspx', 'http://www.fiat.co.uk/header-contacts', 'http://www.fiatusa.com/webselfservice/fiat/', 'https://twitter.com/nic_fincher81/status/672505531689394176']
THIS BELOW IS REGEX
['investor.relations@f', 'investor.relations@f', 'sustainability@f', 'sustainability@f', 'mediarelations@f', 'mediarelations@f']
THIS BELOW IS MAILTOS
investor.relations@fcagroup.com
THIS BELOW IS MAILTOS
sustainability@fcagroup.com
THIS BELOW IS MAILTOS
mediarelations@fcagroup.com
THIS BELOW IS REGEX
[]
THIS BELOW IS REGEX
[]
THIS BELOW IS REGEX
['nic_fincher81@y', 'nic_fincher81@y', 'nic_fincher81@y', 'nic_fincher81@y', 'nic_fincher81@y', 'nic_fincher81@y']
Searching for the terms... Lambrusco email contact? @
['http://www.labattagliola.it/%3Flang%3Den']
Process finished with exit code 0

I would recommend a more restrictive version that still catches all of the email:
([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+]+)
The reason nothing is captured after the first letter following the "@" is that the regex is missing a + at the end:
([a-zA-Z0-9_.+]+@+[a-zA-Z0-9_.+]+)
Originally, the final part [a-zA-Z0-9_.+] simply said to catch exactly one of any of the following characters: a-z, A-Z, 0-9, ., _, +.
I would also be careful about @+, which says to catch 1 or more "@" symbols.
So a potentially valid email could look like this:
..................@@@@@@@@@@@@@@@@@@@@.................
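To see the difference, here is a minimal standalone sketch (with made-up sample text) comparing the truncating pattern against the fixed one:
import re

sample = 'Write to investor.relations@fcagroup.com or sustainability@fcagroup.com for details'

# Original pattern: the class after the @ has no +, so it matches exactly
# one character past the @ and the address gets cut off.
truncating = re.compile(r'([a-zA-Z0-9_.+]+@+[a-zA-Z0-9_.+])')
print(truncating.findall(sample))  # ['investor.relations@f', 'sustainability@f']

# Fixed pattern: the trailing + lets the second class consume the rest
# of the domain, so the full address is captured.
fixed = re.compile(r'([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+]+)')
print(fixed.findall(sample))  # ['investor.relations@fcagroup.com', 'sustainability@fcagroup.com']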

Related

Unable to scrape emails from some websites maybe due to r.html.render() not working properly

I have some website links as samples for extracting any emails available on their internal pages.
However, even though I am trying to render any JS-driven website via r.html.render() within the scrape_email(url) method, some of the websites like arken.trygge.dk, gronnebakken.dk, dagtilbud.ballerup.dk/boernehuset-bispevangen etc. do not return any email, which might be due to a rendering issue.
I have attached the sample file for convenience of running.
I don't want to use Selenium, as there can be thousands or millions of webpages I want to extract emails from.
So far this is my code:
import os
import time
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import re
from requests_html import HTMLSession
import pandas as pd
from gtts import gTTS
import winsound

# For convenience of seeing console output in the script
pd.options.display.max_colwidth = 180

# Get the start time of script execution
startTime = time.time()

# Paste file name inside ''
input_file_name = 'sample'
input_df = pd.read_excel(input_file_name+'.xlsx', engine='openpyxl')
input_df = input_df.dropna(how='all')

internal_urls = set()
emails = set()
total_urls_visited = 0

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_internal_links(url):
    """
    Returns all URLs that are found on `url` and belong to the same website
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    print("Domain name -- ", domain_name)
    try:
        soup = BeautifulSoup(requests.get(url, timeout=5).content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                # href empty tag
                continue
            # join the URL if it's relative (not an absolute link)
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            # remove URL GET parameters, URL fragments, etc.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not is_valid(href):
                # not a valid URL
                continue
            if href in internal_urls:
                # already in the set
                continue
            if parsed_href.netloc != domain_name:
                # if the link is not of the same domain, pass
                continue
            if parsed_href.path.endswith((".csv",".xlsx",".txt", ".pdf", ".mp3", ".png", ".jpg", ".jpeg", ".svg", ".mov", ".js",".gif",".mp4",".avi",".flv",".wav")):
                # Overlook site images, pdfs and other files rather than webpages
                continue
            print(f"Internal link: {href}")
            urls.add(href)
            internal_urls.add(href)
        return urls
    except requests.exceptions.Timeout as err:
        print("The website is not loading within 5 seconds... Continuing crawling the next one")
        pass
    except:
        print("The website is unavailable. Continuing crawling the next one")
        pass

def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"Crawling: {url}")
    links = get_internal_links(url)
    # for link in links:
    #     if total_urls_visited > max_urls:
    #         break
    #     crawl(link, max_urls=max_urls)

def scrape_email(url):
    EMAIL_REGEX = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    # EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
    try:
        # initiate an HTTP session
        session = HTMLSession()
        # get the HTTP Response
        r = session.get(url, timeout=10)
        # for JavaScript-driven websites
        r.html.render()
        single_url_email = []
        for re_match in re.finditer(EMAIL_REGEX, r.html.raw_html.decode()):
            single_url_email.append(re_match.group().lower())
        r.session.close()
        return set(single_url_email)
    except:
        pass

def crawl_website_scrape_email(url, max_internal_url_no=20):
    crawl(url, max_urls=max_internal_url_no)
    each_url_emails = []
    global internal_urls
    global emails
    for each_url in internal_urls:
        each_url_emails.append(scrape_email(each_url))
    URL_WITH_EMAILS = {'main_url': url, 'emails': each_url_emails}
    emails = {}
    internal_urls = set()
    return URL_WITH_EMAILS

def list_check(emails_list, email_match):
    match_indexes = [i for i, s in enumerate(emails_list) if email_match in s]
    return [emails_list[index] for index in match_indexes]

URL_WITH_EMAILS_LIST = [crawl_website_scrape_email(x) for x in input_df['Website'].values]
URL_WITH_EMAILS_DF = pd.DataFrame(data=URL_WITH_EMAILS_LIST)
URL_WITH_EMAILS_DF.to_excel(f"{input_file_name}_email-output.xlsx", index=False)
How can I solve the issue of not being able to scrape emails from the above-mentioned and similar types of websites?
Is there also any way to detect and print a message when my GET request is refused by a bot detector or a related protection?
Also, how can I make this code more robust?
Thank you in advance
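One rough way to surface the second point (an illustrative sketch, not from the original script; the helper name and the phrase list are made up, and no heuristic of this kind is foolproof) is to check the status code and scan the body for common block-page phrases:
import requests

def looks_blocked(url):
    """Hypothetical helper: flag responses that suggest bot detection."""
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    # Status codes commonly returned by rate limiters and WAFs
    if r.status_code in (403, 429, 503):
        return True, f'HTTP {r.status_code}'
    # Phrases that often appear on CAPTCHA / block pages
    for phrase in ('captcha', 'access denied', 'unusual traffic'):
        if phrase in r.text.lower():
            return True, f'page mentions "{phrase}"'
    return False, 'no obvious block'

blocked, reason = looks_blocked('https://example.com')
print(blocked, reason)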

How to search multiple keywords in a web page? This only inputs one keyword

import mechanize
from bs4 import BeautifulSoup
import time
import smtplib

# True by default
while True:
    url = "https://www.google.com"
    browser = mechanize.Browser()
    browser.open(url)
    response = browser.response().read()
    soup = BeautifulSoup(response, "lxml")
    count = 1
    if str(soup).find("English") == -1:
        # wait 60 seconds (change the time (in seconds) as you wish),
        print('Checking - ' + str(count) + 'th Time')
        time.sleep(60)
        count += 1
        # continue with the script
        continue
There are a couple of problems here:
Beautiful Soup provides a method get_text() to extract the text, so you do not need to convert the soup to a string.
String's find() returns -1 when the value is not found. Are you sure that is what you want?
Why do you use time.sleep()? What is the purpose of pausing the program?
You did not create a loop, which makes count redundant, and you will get an error for the continue.
If you want to get the number of occurrences of a string, you can use regex's findall() and then take its length, like: len(re.findall("English", soup_text)).
If you want to find multiple keywords, you can create a list of the keywords and then loop through them like:
for k in ["a", "b", "c"]:
    print(f'{k}: {len(re.findall(k, soup.get_text()))}')
Full example:
from bs4 import BeautifulSoup
import requests # simple http request
import re # regex
url = "https://www.google.com"
doc = requests.get(url)
soup = BeautifulSoup(doc.text, "lxml")
soup_text = soup.get_text()
keywords = ["Google", "English", "a"]
for k in keywords:
    print(f'{k}: {len(re.findall(k, soup_text))}')
I strongly suggest that you study Python thoroughly:
Python: w3school tutorial
BeautifulSoup: Documentation
Regex: w3schools tutorial or RegExr

How to get all emails from a page individually

I am trying to get all emails from a specific page and separate them into an individual variable or even better a dictionary. This is some code.
import requests
import re
import json
from bs4 import BeautifulSoup

page = "http://www.example.net"
info = requests.get(page)
if info.status_code == 200:
    print("Page accessed")
else:
    print("Error accessing page")
code = info.content
soup = BeautifulSoup(code, 'lxml')
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
print(allEmails)
sep = ","
allEmailsStr = str(allEmails)
print(type(allEmails))
print(type(allEmailsStr))
j = allEmailsStr.split(sep, 1)[0]
print(j)
Excuse the poor variable names; I put this together quickly so it would run by itself. The output from the example website would be, for example, something like
[k, kolyma, location, balkans]
So if I ran the program it would return only
[k
But if I wanted it to return every email on there individually, how would I do that?
To get just the email str you can try:
emails = []
for email_link in allEmails:
    emails.append(email_link.get("href").replace('mailto:', ''))
print(emails)
Based on your expected output, you can use the unwrap function of BeautifulSoup
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
for Email in allEmails:
    print(Email.unwrap()) #This will print the whole element along with the tag
    # k
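Since the question also asks for a dictionary, here is a small sketch (building on the allEmails list from the first answer above; the key choice is just one possible convention) that keys each address by the anchor's visible text:
emails = {}
for email_link in allEmails:
    address = email_link.get("href").replace('mailto:', '')
    # Use the anchor's visible text as the key; fall back to the address itself
    key = email_link.get_text(strip=True) or address
    emails[key] = address
print(emails)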

Unable to scrape the conversation among debaters in order to put them in a dictionary

I've created a script to fetch all of the conversation between different debaters, excluding moderators. What I've written so far can fetch the whole conversation. However, I would like to grab it like {speaker_name: (first speech, second speech), etc.}.
Webpage link
another one similar to the above link
webpage link
I've tried so far:
import requests
from bs4 import BeautifulSoup

url = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas'

def get_links(link):
    r = requests.get(link)
    soup = BeautifulSoup(r.text,"lxml")
    for item in soup.select(".field-docs-content p:has( > strong:contains('MODERATOR:')) ~ p"):
        print(item.text)

if __name__ == '__main__':
    get_links(url)
How can I scrape the conversation among debaters and put them in a dictionary?
I don't hold much hope for this lasting across lots of pages, given the variability between the two pages I saw and the number of assumptions I have had to make. Essentially, I use a regex on the text of the participant and moderator nodes to isolate the lists of moderators and participants. I then loop over all speech paragraphs. Each time I encounter a moderator at the start of a paragraph, I set a boolean variable store_paragraph = False and ignore subsequent paragraphs; likewise, each time I encounter a participant, I set store_paragraph = True and store that paragraph and subsequent ones under the appropriate participant key in my speaker_dict. I store each speaker_dict in a final results dictionary.
import requests, re
from bs4 import BeautifulSoup as bs
import pprint

links = ['https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas','https://www.presidency.ucsb.edu/documents/republican-presidential-candidates-debate-manchester-new-hampshire-0']
results = {}
p = re.compile(r'\b(\w+)\b\s+\(|\b(\w+)\b,')

with requests.Session() as s:
    for number, link in enumerate(links):
        r = s.get(link)
        soup = bs(r.content,'lxml')
        participants_tag = soup.select_one('p:has(strong:contains("PARTICIPANTS:"))')
        if participants_tag.select_one('strong'):
            participants_tag.strong.decompose()
        speaker_dict = {i[0].upper() + ':' if i[0] else i[1].upper() + ':': [] for string in participants_tag.stripped_strings for i in p.findall(string)}
        # print(speaker_dict)
        moderator_data = [string for string in soup.select_one('p:has(strong:contains("MODERATOR:","MODERATORS:"))').stripped_strings][1:]
        # print(moderator_data)
        moderators = [i[0].upper() + ':' if i[0] else i[1].upper() + ':' for string in moderator_data for i in p.findall(string)]
        store_paragraph = False
        for paragraph in soup.select('.field-docs-content p:not(p:contains("PARTICIPANTS:","MODERATOR:"))')[1:]:
            string_to_compare = paragraph.text.split(':')[0] + ':'
            string_to_compare = string_to_compare.upper()
            if string_to_compare in moderators:
                store_paragraph = False
            elif string_to_compare in speaker_dict:
                speaker = string_to_compare
                store_paragraph = True
            if store_paragraph:
                speaker_dict[speaker].append(paragraph.text)
        results[number] = speaker_dict

pprint.pprint(results[1])

Python: unable to get any output using BeautifulSoup

I am trying to scrape some words from a random website, but the following program shows no errors and no output when I try printing the results.
I have checked the code twice and even incorporated an if statement to see whether the program is getting any words or not.
import requests
import operator
from bs4 import BeautifulSoup

def word_count(url):
    wordlist = []
    source_code = requests.get(url)
    source = BeautifulSoup(source_code.text, features="html.parser")
    for post_text in source.findAll('a', {'class':'txt'}):
        word_string = post_text.string
        if word_string is not None:
            word = word_string.lower().split()
            for each_word in word:
                print(each_word)
                wordlist.append(each_word)
        else:
            print("None")

word_count('https://mumbai.craigslist.org/')
I am expecting all the words under class="txt" to be displayed in the output.
OP: I am expecting all the words of the class text to be displayed in the output
The culprit:
for post_text in source.findAll('a', {'class':'txt'}):
The reason:
The anchor tag has no class txt, but the span tag inside it does.
Hence:
import requests
from bs4 import BeautifulSoup

def word_count(url):
    source_code = requests.get(url)
    source = BeautifulSoup(source_code.text, features="html.parser")
    for post_text in source.findAll('a'):
        s_text = post_text.find('span', class_ = "txt")
        if s_text is not None:
            print(s_text.text)

word_count('https://mumbai.craigslist.org/')
OUTPUT:
community
activities
artists
childcare
classes
events
general
groups
local news
lost+found
missed connections
musicians
pets
.
.
.
You are targeting the wrong elements.
If you use
print(source)
everything works fine, but the moment you try to target the element with findAll, you are targeting something wrong, because you get back an empty list.
If you replace
for post_text in source.findAll('a', {'class':'txt'}):
with
for post_text in source.find_all('a'):
everything seems to work fine.
I have visited https://mumbai.craigslist.org/ and found there is no <a class="txt">, only <span class="txt">, so I think you can try this:
def word_count(url):
    wordlist = []
    source_code = requests.get(url)
    source = BeautifulSoup(source_code.text, features="html.parser")
    for post_text in source.findAll('span', {'class':'txt'}):
        word_string = post_text.text
        if word_string is not None:
            word = word_string.lower().split()
            for each_word in word:
                print(each_word)
                wordlist.append(each_word)
        else:
            print("None")
it will output correctly:
community
activities
artists
childcare
classes
events
general
...
Hope that helps you, and comment if you have further questions. : )
