Extract domain name from URL in Python

I am trying to extract the domain names out of a list of URLs, just like in
https://stackoverflow.com/questions/18331948/extract-domain-name-from-the-url
My problem is that the URLs can be about anything. A few examples:
m.google.com => google
m.docs.google.com => google
www.someisotericdomain.innersite.mall.co.uk => mall
www.ouruniversity.department.mit.ac.us => mit
www.somestrangeurl.shops.relevantdomain.net => relevantdomain
www.example.info => example
And so on.
The diversity of the domains doesn't allow me to use a regex as shown in how to get domain name from URL (my script will be running on an enormous number of URLs from real network traffic, so the regex would have to be enormous in order to catch all kinds of domains as mentioned).
Unfortunately, my web research didn't turn up any efficient solution.
Does anyone have an idea of how to do this?
Any help will be appreciated!
Thank you

Use tldextract, which is a more efficient version of urlparse. tldextract accurately separates the gTLD or ccTLD (generic or country code top-level domain) from the registered domain and subdomains of a URL.
>>> import tldextract
>>> ext = tldextract.extract('http://forums.news.cnn.com/')
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
>>> ext.domain
'cnn'
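For the kind of hostnames listed in the question, a minimal sketch (assuming tldextract is installed, e.g. via pip; it also accepts bare hostnames without a scheme, and multi-part suffixes such as co.uk are resolved against the Public Suffix List it uses):
>>> import tldextract
>>> hosts = ['m.docs.google.com',
...          'www.someisotericdomain.innersite.mall.co.uk',
...          'www.example.info']
>>> [tldextract.extract(h).domain for h in hosts]
['google', 'mall', 'example']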

It seems you can use urlparse (https://docs.python.org/3/library/urllib.parse.html) for that URL and then extract the netloc.
From the netloc you could easily extract the domain name by using split.
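A rough sketch of that idea (the helper name domain_from_url is just for illustration; note that a plain split keeps only the second-to-last label, so multi-part suffixes such as co.uk from the question would come out wrong):
from urllib.parse import urlparse

def domain_from_url(url):
    # netloc is only populated when the URL carries a scheme such as http://
    netloc = urlparse(url).netloc or url
    netloc = netloc.split(':')[0]              # drop an explicit port, if any
    labels = netloc.split('.')
    return labels[-2] if len(labels) >= 2 else netloc

print(domain_from_url('http://m.docs.google.com/'))   # google
print(domain_from_url('www.example.info'))            # example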

Simple solution via split:
def domain_name(url):
    return url.split("www.")[-1].split("//")[-1].split(".")[0]

With regex, you could use something like this:
(?<=\.)([^.]+)(?:\.(?:co\.uk|ac\.us|[^.]+(?:$|\n)))
https://regex101.com/r/WQXFy6/5
Note that you'll have to watch out for special cases such as co.uk.
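A quick way to try that pattern in Python (a sketch; note the lookbehind needs at least one label before the domain, so a bare example.com would not match):
import re

pattern = re.compile(r'(?<=\.)([^.]+)(?:\.(?:co\.uk|ac\.us|[^.]+(?:$|\n)))')

for host in ['m.docs.google.com',
             'www.someisotericdomain.innersite.mall.co.uk',
             'www.example.info']:
    m = pattern.search(host)
    print(m.group(1) if m else None)
# google
# mall
# example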

Check the replace and split methods.
PS: this only works for simple links like https://youtube.com (output=youtube) and www.user.ru.com (output=user).
def domain_name(url):
    return url.replace("www.","http://").split("//")[1].split(".")[0]
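A quick check of the two cases mentioned in the PS:
>>> domain_name("https://youtube.com")
'youtube'
>>> domain_name("www.user.ru.com")
'user'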

import re

def getDomain(url: str) -> str:
    '''
    Return the domain from any url
    '''
    # keep a copy of the original url text
    clean_url = url
    # take out an explicit port number, e.g. ':8080' (the regex only matches ':' followed by digits)
    reg = re.findall(':[0-9]+', url)
    if len(reg) > 0:
        url = url.replace(reg[0], '')
    # take out path routes
    if '/' in url:
        url = url.split('/')
    # select only the host part (index 2 of the split when a scheme like http:// is present)
    if 'http' in clean_url:
        url = url[2]
    # join back to a string (no-op if the url was never split)
    url = ''.join(url)
    # keep only the last two labels of the hostname
    url = '.'.join(url.split('.')[-2:])
    return url
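A couple of sample calls (a sketch of the expected behaviour: it keeps the last two labels, so it returns cnn.com rather than just cnn, and multi-part suffixes such as co.uk from the question are not split out):
>>> getDomain('http://forums.news.cnn.com/path/page.html')
'cnn.com'
>>> getDomain('www.example.info')
'example.info'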

from urllib.parse import urlparse
import validators

hostnames = []
counter = 0
errors = 0

for row_orig in rows:
    try:
        row = row_orig.rstrip().lstrip().split(' ')[1].rstrip()
        if len(row) < 5:
            print(f"Empty row {row_orig}")
            errors += 1
            continue
        if row.startswith('http'):
            domain = urlparse(row).netloc  # works for https and http
        else:
            domain = row
        if ':' in domain:
            domain = domain.split(':')[0]  # split at port after clearing http/https protocol
        # Finally validate it
        if validators.domain(domain):
            pass
        elif validators.ipv4(domain):
            pass
        else:
            print(f"Invalid domain/IP {domain}. RAW: {row}")
            errors += 1
            continue
        hostnames.append(domain)
        if counter % 10000 == 1:
            print(f"Added {counter}. Errors {errors}")
        counter += 1
    except:
        print("Error in extraction")
        errors += 1
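For context, a hypothetical way rows could be populated, assuming a log file where each line has the URL as its second whitespace-separated field (that is what the split(' ')[1] above expects):
with open('traffic.log') as f:   # hypothetical input file
    rows = f.readlines()         # e.g. "1496305200 http://m.docs.google.com/ ..."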

To extract the domain from a URL:
from urllib.parse import urlparse
url = "https://stackoverflow.com/questions/44021846/extract-domain-name-from-url-in-python"
domain = urlparse(url).netloc  # "stackoverflow.com"
To check whether the domain is in a given list:
if urlparse(url).netloc in ["domain1", "domain2", "domain3"]:
    ...  # do something

Related

How to get all emails from a page individually

I am trying to get all emails from a specific page and separate them into individual variables, or even better a dictionary. Here is some code.
import requests
import re
import json
from bs4 import BeautifulSoup
page = "http://www.example.net"
info = requests.get(page)
if info.status_code == 200:
    print("Page accessed")
else:
    print("Error accessing page")
code = info.content
soup = BeautifulSoup(code, 'lxml')
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
print(allEmails)
sep = ","
allEmailsStr = str(allEmails)
print(type(allEmails))
print(type(allEmailsStr))
j = allEmailsStr.split(sep, 1)[0]
print(j)
Excuse the poor variable names; I put this together quickly so it would run by itself. The output from the example website would be something like
[k, kolyma, location, balkans]
So if I ran the program it would return only
[k
But if I wanted it to return every email on there individually, how would I do that?
To get just the email str you can try:
emails = []
for email_link in allEmails:
    emails.append(email_link.get("href").replace('mailto:', ''))
print(emails)
Based on your expected output, you can use the unwrap function of BeautifulSoup:
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
for Email in allEmails:
    print(Email.unwrap())  # This will print the whole element along with the tag
# k

Python - Using urllib to retrieve web content but got different content from what I got using the browser

I want to write a translation API using this site, which has many desirable features when dealing with sentences containing wildcards.
First I used F12 in Chrome to see what request URL is used to produce the result.
I checked that only salt and sign changed when I used different inputs.
So I looked at the JS source code to see how salt and sign were produced.
Then I used the Python library urllib to send the request and get the response. But the response translation was not the same as what I get using the browser. For example,
Input :"what album was #head_entity# released on?"
Output_browser: "#head_entity#发布了什么专辑?"
Output_python:"发布的专辑是什么# head_entity?#"
which is clearly different.
This is the code for producing my result:
import urllib.request
import urllib.parse
import json
import time
import random
import hashlib

def translator(content):
    """arg:content"""
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    data = {}
    u = 'fanyideskweb'
    d = content
    f = str(int(time.time()*1000) + random.randint(1,10))
    c = 'rY0D^0\'nM0}g5Mm1z%1G4'
    sign = hashlib.md5((u + d + f + c).encode('utf-8')).hexdigest()
    data['i'] = content
    data['from'] = 'AUTO'
    data['to'] = 'AUTO'
    data['smartresult'] = 'dict'
    data['client'] = 'fanyideskweb'
    data['salt'] = f
    data['sign'] = sign
    data['doctype'] = 'json'
    data['version'] = '2.1'
    data['keyfrom'] = 'fanyi.web'
    data['action'] = 'FY_BY_CL1CKBUTTON'
    data['typoResult'] = 'true'
    data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(url=url, data=data, method='POST')
    response = urllib.request.urlopen(request)
    d = json.loads(response.read().decode('utf-8'))
    return d['translateResult'][0][0]['tgt']
translator('what album was #head_entity# released on?')
The only thing I think I changed that makes my request different from the original page is the url argument in the code:
My_url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
Original_url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule' (which gave me an error {"errorCode":50})
I checked the header and data parameters one by one but still can't solve the problem. I have no idea why this happened. Any ideas?

Improve Regex to catch complete emails from Google search?

In order to practice and help my sister get emails from doctors for her baby, I have designed this email harvester. It makes a search, cleans the URLs given, adds them to a dictionary, and parses them for emails in two different ways.
The code has been taken from different places, so if you correct me, please explain clearly your improvement, as I am working at the limit of my knowledge already.
The question is how to get emails better (and improve code, if possible). I'll post the code and the exact output below:
CODE of my program:
import requests, re, webbrowser, bs4
from selenium import webdriver
from bs4 import BeautifulSoup
import time, random, webbrowser
import urllib.request

def google_this():  # Googles and gets the first few links
    search_terms = ['Fiat','Lambrusco']
    added_terms = 'email contact? #'
    # This searches for certain keywords in Google and parses results with BS
    for el in search_terms:
        webpage = 'http://google.com/search?q=' + str(el) + str(added_terms)
        print('Searching for the terms...', el, added_terms)
        headers = {'User-agent':'Mozilla/5.0'}
        res = requests.get(webpage, headers=headers)
        # res.raise_for_status()
        statusCode = res.status_code
        if statusCode == 200:
            soup = bs4.BeautifulSoup(res.text,'lxml')
            serp_res_rawlink = soup.select('.r a')
            dicti = []  # This gets the href links
            for link in serp_res_rawlink:
                url = link.get('href')
                if 'pdf' not in url:
                    dicti.append(url)
            dicti_url = []  # This cleans the "url?q=" from link
            for el in dicti:
                if '/url?q=' in el:
                    result = (el.strip('/url?q='))
                    dicti_url.append(result)
            # print(dicti_url)
            dicti_pretty_links = []  # This cleans the gibberish at end of url
            for el in dicti_url[0:4]:
                pretty_url = el.partition('&')[0]
                dicti_pretty_links.append(pretty_url)
            print(dicti_pretty_links)
            for el in dicti_pretty_links:  # This converts page in BS soup
                # browser = webdriver.Firefox()
                # browser.get(el)
                # print('I have been in the element below and closed the window')
                # print(el)
                # time.sleep(1)
                # browser.close()
                webpage = (el)
                headers = {'User-agent':'Mozilla/5.0'}
                res = requests.get(webpage, headers=headers)
                # res.raise_for_status()
                statusCode = res.status_code
                if statusCode == 200:
                    soup = bs4.BeautifulSoup(res.text,'lxml')
                    # This is the first way to search for an email in soup
                    emailRegex = re.compile(r'([a-zA-Z0-9_.+]+#+[a-zA-Z0-9_.+])', re.VERBOSE)
                    mo = emailRegex.findall(res.text)
                    # mo = emailRegex.findall(soup.prettify())
                    print('THIS BELOW IS REGEX')
                    print(mo)
                    # This is the second way to search for an email in soup:
                    mailtos = soup.select('a[href^=mailto]')
                    for el in mailtos:
                        print('THIS BELOW IS MAILTOS')
                        print(el.text)
                time.sleep(random.uniform(0.5,1))

google_this()
And here is the output from running this very same code. As you can see, some emails seem to be found, but they are cut just after the "#" symbol:
C:\Users\SK\AppData\Local\Programs\Python\Python35-32\python.exe C:/Users/SK/PycharmProjects/untitled/another_temperase.py
Searching for the terms... Fiat email contact? #
['http://www.fcagroup.com/en-US/footer/Pages/contacts.aspx', 'http://www.fiat.co.uk/header-contacts', 'http://www.fiatusa.com/webselfservice/fiat/', 'https://twitter.com/nic_fincher81/status/672505531689394176']
THIS BELOW IS REGEX
['investor.relations#f', 'investor.relations#f', 'sustainability#f', 'sustainability#f', 'mediarelations#f', 'mediarelations#f']
THIS BELOW IS MAILTOS
investor.relations#fcagroup.com
THIS BELOW IS MAILTOS
sustainability#fcagroup.com
THIS BELOW IS MAILTOS
mediarelations#fcagroup.com
THIS BELOW IS REGEX
[]
THIS BELOW IS REGEX
[]
THIS BELOW IS REGEX
['nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y']
Searching for the terms... Lambrusco email contact? #
['http://www.labattagliola.it/%3Flang%3Den']
Process finished with exit code 0
I would recommend a more restrictive version that still catches all of the email:
([a-zA-Z0-9_.+]+#[a-zA-Z0-9_.+]+)
The problem of not catching anything after the first letter after the # is that the regex is missing a +:
([a-zA-Z0-9_.+]+#+[a-zA-Z0-9_.+]+)
Originally, the part [a-zA-Z0-9_.+] simply said to catch one of any of the following characters: a-z, A-Z, 0-9, ., _, +.
I would also be careful about #+ which says to catch 1 or more "#" symbols.
So a potentially valid email could look like this:
..................########################.................
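A quick sketch of the difference on a sample string (keeping the # placeholder the question uses in place of @):
import re

text = "Contact: investor.relations#fcagroup.com"
print(re.findall(r'([a-zA-Z0-9_.+]+#+[a-zA-Z0-9_.+])', text))   # ['investor.relations#f']
print(re.findall(r'([a-zA-Z0-9_.+]+#[a-zA-Z0-9_.+]+)', text))   # ['investor.relations#fcagroup.com']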

For loops with user input on Python

Hello I'm learning how to parse HTML with BeautifulSoup. I would like to know if it is possible to use a user input in a for loop, as:
for (user input) in A
A is a list of links, so the user can choose which link to follow, using an input.
And then I use urllib to open that link and repeat the process.
You can use something like this:
from urllib.request import urlopen
from bs4 import BeautifulSoup

choice = ''
for url in urls:
    print('Go to {}?'.format(url))
    decision = input('Y/n ')
    if decision == 'Y':
        choice = url
        break

if choice:
    r = urlopen(choice).read()
    soup = BeautifulSoup(r, 'lxml')
    # do something else
It wasn't exactly clear to me if you really wanted to "open" the link in a browser, so I included some code to do that. Is this maybe what you wanted from "digit a position"?
tl;dr
print("Which URL would you like to open?"
      " (Please select an option between 1-{})".format(len(A)))
for index, link in enumerate(A):
    print(index + 1, link)
Full:
from bs4 import BeautifulSoup
import requests
import webbrowser

A = [
    'https://www.google.com',
    'https://www.stackoverflow.com',
    'https://www.xkcd.com',
]

print("Which URL would you like to open?"
      " (Please select an option between 1-{})".format(len(A)))
for index, link in enumerate(A):
    print(index + 1, link)

_input = input()
try:
    option_index = int(_input) - 1
except ValueError:
    print("{} is not a valid choice.".format(_input))
    raise
try:
    selection = A[option_index]
except IndexError:
    print("{} is not a valid choice.".format(_input))
    raise

webbrowser.open(selection)
response = requests.get(selection)
html_string = response.content
# Do parsing...
Thanks for your help. I reached a solution for this.
I created two variables: count = input() and position = input()
The count I used in a for loop: for _ in range(c) - with this I can make a process repeat the number of times the user wants (for this assignment it is 4).
The position (which for this assignment is predefined as 3) I use as a list index, in a list with all the URLs. So to open the URL in position 3 I have:
url = links[p-1] (-1 because the user inputs 3, but list indexing starts at 0 (0, 1, 2, ...))
And then I can use urllib.request.urlopen(url).read()
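A minimal sketch of what that description amounts to (the names c, p and links come from the description above; the actual link list is assumed):
import urllib.request

links = ['https://www.example.com/a',
         'https://www.example.com/b',
         'https://www.example.com/c']      # assumed: the list with all the URLs

c = int(input('How many repetitions? '))   # count (4 in the assignment)
p = int(input('Which position? '))         # position (3 in the assignment)

for _ in range(c):
    url = links[p - 1]                     # -1 because the user counts from 1, list indices start at 0
    html = urllib.request.urlopen(url).read()
    # parse html here, e.g. with BeautifulSoup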

python mechanize follow_link fails

I'm trying to access search results on the NCBI Images search page (http://www.ncbi.nlm.nih.gov/images) in a script. I want to feed it a search term, report on all of the results, and then move on to the next search term. To do this I need to get to results pages after the first page, so I'm trying to use python mechanize to do it:
import mechanize
browser=mechanize.Browser()
page1=browser.open('http://www.ncbi.nlm.nih.gov/images?term=drug')
a=browser.links(text_regex='Next')
nextlink=a.next()
page2=browser.follow_link(nextlink)
This just gives me back the first page of search results again (in variable page2). What am I doing wrong, and how can I get to that second page and beyond?
Unfortunately that page uses Javascript to POST 2459 bytes of form variables to the server, just to navigate to a subsequent page. Here are a few of the variables (I count 38 vars in total):
EntrezSystem2.PEntrez.ImagesDb.Images_SearchBar.Term=drug
EntrezSystem2.PEntrez.ImagesDb.Images_SearchBar.CurrDb=images
EntrezSystem2.PEntrez.ImagesDb.Images_ResultsPanel.Entrez_Pager.CurrPage=2
You'll need to construct a POST request to the server containing some or all of these variables. Luckily if you get it working for page 2 you can simply increment CurrPage and send another POST to get each subsequent page of results (no need to extract links).
Update - That site is a total pain-in-the-ass, but here is a POST-based scrape of the 2-N pages. Set MAX_PAGE to the highest page number + 1. The script will produce files like file_000003.html.
Note: Before you use it, you need to replace POSTDATA with the contents of this paste blob (it expires in 1 month). It's just the body of a POST request as captured by Firebug, which I use to seed the correct params:
import cookielib
import json
import mechanize
import sys
import urllib
import urlparse

MAX_PAGE = 6
TERM = 'drug'
DEBUG = False

base_url = 'http://www.ncbi.nlm.nih.gov/images?term=' + TERM

browser = mechanize.Browser()
browser.set_handle_robots(False)
browser.set_handle_referer(True)
browser.set_debug_http(DEBUG)
browser.set_debug_responses(DEBUG)

cjar = cookielib.CookieJar()
browser.set_cookiejar(cjar)

# make first GET request. this will populate the cookie
res = browser.open(base_url)

def write(num, data):
    with open('file_%06d.html' % num, 'wb') as out:
        out.write(data)

def encode(kvs):
    res = []
    for key, vals in kvs.iteritems():
        if isinstance(vals, list):
            for v in vals:
                res.append('%s=%s' % (key, urllib.quote(v)))
        else:
            res.append('%s=%s' % (key, urllib.quote(vals)))
    return '&'.join(res)

write(1, res.read())

# set this var equal to the contents of this: http://pastebin.com/UfejW3G0
POSTDATA = '''<post data>'''

# parse the embedded json vars into POST parameters
PREFIX1 = 'EntrezSystem2.PEntrez.ImagesDb.'
PREFIX2 = 'EntrezSystem2.PEntrez.DbConnector.'
params = dict((k, v[0]) for k, v in urlparse.parse_qs(POSTDATA).iteritems())

base_url = 'http://www.ncbi.nlm.nih.gov/images'

for page in range(2, MAX_PAGE):
    params[PREFIX1 + 'Images_ResultsPanel.Entrez_Pager.CurrPage'] = str(page)
    params[PREFIX1 + 'Images_ResultsPanel.Entrez_Pager.cPage'] = [str(page-1)]*2
    data = encode(params)
    req = mechanize.Request(base_url, data)
    cjar.add_cookie_header(req)
    req.add_header('Content-Type', 'application/x-www-form-urlencoded')
    req.add_header('Referer', base_url)
    res = browser.open(req)
    write(page, res.read())
