Extract count of specific links from a web page - Python

I am writing a Python script using BeautifulSoup. I need to scrape a website and count the unique links, ignoring links starting with '#'.
For example, if the following links exist on a webpage:
https://www.stackoverflow.com/questions
https://www.stackoverflow.com/foo
https://www.cnn.com/
For this example, there would be only two unique links (everything after the main domain name is removed):
https://stackoverflow.com/ Count 2
https://cnn.com/ Count 1
Note: this is my first time using Python and web scraping tools.
I appreciate all the help in advance.
This is what I have tried so far:
from bs4 import BeautifulSoup
import requests

url = 'https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

count = 0
for link in soup.find_all('a'):
    print(link.get('href'))
    count += 1

There is a function named urlparse in urllib.parse that you can use to get the netloc of a URL. There is also a newer HTTP library named requests_html that can collect all the absolute links in a page for you.
from requests_html import HTMLSession
from collections import Counter
from urllib.parse import urlparse

session = HTMLSession()
r = session.get("the link you want to crawl")
unique_netlocs = Counter(urlparse(link).netloc for link in r.html.absolute_links)
for link in unique_netlocs:
    print(link, unique_netlocs[link])

You could also do this:
from bs4 import BeautifulSoup
from collections import Counter
import requests

soup = BeautifulSoup(requests.get("https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)").text, "html.parser")

foundUrls = Counter([link["href"] for link in soup.find_all("a", href=lambda href: href and not href.startswith("#"))])
foundUrls = foundUrls.most_common()

for item in foundUrls:
    print("%s: %d" % (item[0], item[1]))
The soup.find_all line checks that every a tag has an href set and that it doesn't start with the # character.
Counter counts the occurrences of each list entry, and most_common orders them by count.
The for loop just prints the results.
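If you want counts per domain in the format shown in the question, rather than per full href, here is a rough sketch combining this approach with urlparse (untested, and the 'www.' handling is kept deliberately simple):
from collections import Counter
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get("https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)").text, "html.parser")

domain_counts = Counter()
for link in soup.find_all("a", href=lambda href: href and not href.startswith("#")):
    netloc = urlparse(link["href"]).netloc
    if netloc:  # only absolute links carry a domain
        if netloc.startswith("www."):
            netloc = netloc[len("www."):]  # drop a leading www.
        domain_counts[netloc] += 1

for domain, count in domain_counts.most_common():
    print("https://%s/ Count %d" % (domain, count))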

My way to do this is to find all links using Beautiful Soup and then determine which link redirects to which location:
import requests
import tldextract
from bs4 import BeautifulSoup

def get_count_url(url):  # get the number of links having the same domain and suffix
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    count = 0
    urls = {}  # dictionary for the domains
    # input_domain = url.split('//')[1].split('/')[0]
    # library to extract the exact domain (e.g. blog.bbc.com and bbc.com have the same domain)
    input_domain = tldextract.extract(url).domain + "." + tldextract.extract(url).suffix
    for link in soup.find_all('a'):
        word = link.get('href')
        # print(word)
        if word:
            # Same website or domain calls
            if "#" in word or word[0] == "/":  # div call or same domain call
                if not input_domain in urls:
                    # print(input_domain)
                    urls[input_domain] = 1  # first encounter with the domain
                else:
                    urls[input_domain] += 1  # multiple encounters
            elif "javascript" in word:
                # javascript function calls (for domains that use modern JS frameworks to display information)
                if not "JavascriptRenderingFunctionCall" in urls:
                    urls["JavascriptRenderingFunctionCall"] = 1
                else:
                    urls["JavascriptRenderingFunctionCall"] += 1
            else:
                # main_domain = word.split('//')[1].split('/')[0]
                main_domain = tldextract.extract(word).domain + "." + tldextract.extract(word).suffix
                # print(main_domain)
                if main_domain.split('.')[0] == 'www':
                    main_domain = main_domain.replace("www.", "")  # removing the www
                if not main_domain in urls:  # maintaining the dictionary
                    urls[main_domain] = 1
                else:
                    urls[main_domain] += 1
            count += 1
    for key, value in urls.items():  # printing the dictionary for better readability
        print(key, value)
    return count
tldextract extracts the registered domain from the URL, and soup.find_all('a') finds the a tags. The if statements check for same-domain redirects, javascript redirects, or redirects to other domains.
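For reference, a small illustration of what tldextract returns (this assumes the third-party tldextract package is installed; the example URL is arbitrary):
import tldextract

# tldextract splits a URL into subdomain, domain and suffix
ext = tldextract.extract("https://blog.bbc.com/news/article")
print(ext.subdomain)                  # blog
print(ext.domain + "." + ext.suffix)  # bbc.com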

Related

Removing duplicate links from scraper I'm making

#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup
import re

url = input("Please enter a URL to scrape: ")
r = requests.get(url)
html = r.text
print(html)
soup = BeautifulSoup(html, "html.parser")

for link in soup.find_all('a', attrs={'href': re.compile("^https://")}):
    print(link.get('href'))
Down at the bottom, where it prints the links, I can't think of a way to remove duplicate entries. Can someone help me with that, please?
Use a set to remove duplicates. You call add() to add an item and if the item is already present then it won't be added again.
Try this:
#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup
import re

url = input("Please enter a URL to scrape: ")
r = requests.get(url)
html = r.text
print(html)
soup = BeautifulSoup(html, "html.parser")

urls = set()
for link in soup.find_all('a', attrs={'href': re.compile(r"^https://")}):
    urls.add(link.get('href'))
print(urls)  # urls contains a unique set of URLs
Note that some URLs might start with http://, so you may want to use the regexp ^https?:// to catch both http and https URLs.
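For example, the loop above would become (only the regular expression changes):
for link in soup.find_all('a', attrs={'href': re.compile(r"^https?://")}):
    urls.add(link.get('href'))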
You can also use set comprehension syntax to rewrite the assignment and for statements like this.
urls = {
link.get("href")
for link in soup.find_all("a", attrs={"href": re.compile(r"^https://")})
}
Instead of printing each link, you need to collect them somewhere so you can compare them.
Try this:
You get a list with all the results from find_all and make it a set.
data = set(link.get('href') for link in soup.find_all('a', attrs={'href': re.compile("^https://")}))
for elem in data:
    print(elem)

Web scraping using Python - beginner level

Hello, I am new to Python and practicing web scraping with some demo sites.
I am trying to scrape the website http://books.toscrape.com/ and want to extract:
href
name/title
star rating/star-rating
price/price_color
in-stock availability/instock availability
I have written some basic code which goes down to each book level, but after that I am clueless as to how I can extract that information.
import requests
from csv import reader, writer
from bs4 import BeautifulSoup

base_url = "http://books.toscrape.com/"
r = requests.get(base_url)
htmlContent = r.content
soup = BeautifulSoup(htmlContent, 'html.parser')

for article in soup.find_all('article'):
    pass  # stuck here: how do I extract the fields for each book?
This will find the href and name for every book. You could also extract some other information if you want.
import requests
from csv import reader, writer
from bs4 import BeautifulSoup

base_url = "http://books.toscrape.com/"
r = requests.get(base_url)
soup = BeautifulSoup(r.content, 'html.parser')

def extract_info(soup):
    href = []
    for a in soup.find_all('a', href=True):
        if a.text:
            if "catalogue" in a["href"]:
                href.append(a['href'])
    name = []
    for a in soup.find_all('a', title=True):
        name.append(a.text)
    return href, name

href, name = extract_info(soup)
print(href[0], name[0])
The output will be the href and name of the first book.
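Since the question already imports csv writer, here is a rough sketch of how the extracted pairs could be written out (the books.csv filename is just an example; the two lists may differ in length, and zip stops at the shorter one):
from csv import writer  # already imported in the snippet above

# Write the extracted href/name pairs to a CSV file (illustrative only)
with open('books.csv', 'w', newline='') as f:
    csv_writer = writer(f)
    csv_writer.writerow(['href', 'name'])
    for h, n in zip(href, name):
        csv_writer.writerow([h, n])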
Try the approach below using Python requests and BeautifulSoup. I fetched the page URL from the website itself after inspecting the Network section > Doc tab of the Google Chrome browser.
What exactly the script below is doing:
First it builds the page URL from a page-number parameter and makes a GET request. The URL is dynamic and is rebuilt on each iteration; you will notice that the PAGE_NO parameter is incremented after every iteration.
After getting the data, the script parses the HTML code using the html.parser parser.
Finally it iterates over the list of books fetched on each page and prints the title, hyperlink, price, stock availability and rating.
There are 50 pages and 1,000 results; the script below extracts all the book details, one page per iteration.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs

def scrap_books_data():
    PAGE_NO = 1  # page number parameter which gets incremented after every iteration
    while True:
        print('Creating URL to scrape books data for ', str(PAGE_NO))
        URL = 'http://books.toscrape.com/catalogue/page-' + str(PAGE_NO) + '.html'  # dynamic URL created on every iteration
        response = requests.get(URL, verify=False)  # GET request to fetch data from the site
        soup = bs(response.text, 'html.parser')  # parse HTML data using 'html.parser'
        extracted_books_data = soup.find_all('article', class_='product_pod')  # find all article tags where book details are nested
        if len(extracted_books_data) == 0:  # break the loop and exit if there is no more data to process
            break
        else:
            for item in range(len(extracted_books_data)):  # iterate over the list of extracted books
                print('-' * 100)
                print('Title : ', extracted_books_data[item].contents[5].contents[0].attrs['title'])
                print('Link : ', extracted_books_data[item].contents[5].contents[0].attrs['href'])
                print('Rating : ', extracted_books_data[item].contents[3].attrs['class'][1])
                print('Price : ', extracted_books_data[item].contents[7].contents[1].text.replace('Â', ''))
                print('Availability : ', extracted_books_data[item].contents[7].contents[3].text.replace('\n', '').strip())
                print('-' * 100)
        PAGE_NO += 1  # increment page number by 1 to scrape the next page
scrap_books_data()
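As a side note, navigating by .contents indices is fragile if whitespace nodes shift position. Here is a hedged alternative sketch that reads each field by class name instead, assuming the usual books.toscrape.com markup (h3 > a, p.star-rating, p.price_color and p.instock.availability inside each article.product_pod):
import requests
from bs4 import BeautifulSoup as bs

# Read each field by class name rather than by .contents index
soup = bs(requests.get('http://books.toscrape.com/catalogue/page-1.html').text, 'html.parser')
for book in soup.find_all('article', class_='product_pod'):
    link = book.h3.a
    print('Title : ', link['title'])
    print('Link : ', link['href'])
    print('Rating : ', book.find('p', class_='star-rating')['class'][1])
    print('Price : ', book.find('p', class_='price_color').text)
    print('Availability : ', book.find('p', class_='instock').text.strip())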

Beautifulsoup "findAll()" does not return the tags

I am trying to build a scraper to get some abstracts of academic papers and their corresponding titles on this page.
The problem is that my for link in bsObj.findAll('a',{'class':'search-track'}) does not return the links I need to further build my scraper. In my code, the check is like this:
for link in bsObj.findAll('a', {'class': 'search-track'}):
    print(link)
The for loop above does not print out anything; however, the href links should be inside the <a class="search-track" ...</a> tags.
I have referred to this post, but changing the BeautifulSoup parser did not solve my problem. I am using "html.parser" in my BeautifulSoup constructor: bsObj = bs(html.content, features="html.parser").
And the print(len(bsObj)) prints out "3" while it prints out "2" for both "lxml" and "html5lib".
Also, I started off using urllib.request.urlopen to get the page and then tried requests.get() instead. Unfortunately the two approaches give me the same bsObj.
Here is the code I've written:
#from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup as bs
import ssl

'''
The elsevier search is kind of a tree structure:
keyword --> a list of journals (a journal contains many articles) --> lists of articles
'''

address = input("Please type in your keyword: ")  # my keyword is "catalyst for water splitting"
# https://www.elsevier.com/en-xs/search-results?
# query=catalyst%20for%20water%20splitting&labels=journals&page=1
address = address.replace(" ", "%20")
address = "https://www.elsevier.com/en-xs/search-results?query=" + address + "&labels=journals&page=1"

journals = []
articles = []

def getJournals(url):
    global journals
    #html = urlopen(url)
    html = requests.get(url)
    bsObj = bs(html.content, features="html.parser")
    #print(len(bsObj))
    #testFile = open('testFile.txt', 'wb')
    #testFile.write(bsObj.text.encode(encoding='utf-8', errors='strict') + '\n'.encode(encoding='utf-8', errors='strict'))
    #testFile.close()

    for link in bsObj.findAll('a', {'class': 'search-track'}):
        print(link)
        ######## does not print anything ########
        '''
        if 'href' in link.attrs and link.attrs['href'] not in journals:
            newJournal = link.attrs['href']
            journals.append(newJournal)
        '''
    return None

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

getJournals(address)
print(journals)
Can anyone tell me what the problem in my code is, so that the for loop prints out the links? I need to store the journal links in a list and then visit each link to scrape the abstracts of the papers. As far as I know, the abstract of a paper is free, so the website shouldn't have blocked my ID for scraping it.
This page is loaded dynamically with JavaScript, so BeautifulSoup can't handle it directly. You may be able to do it using Selenium, but in this case you can do it by tracking the API calls made by the page (for more, see, as one of many examples, here).
In your particular case it can be done this way:
from bs4 import BeautifulSoup as bs
import requests
import json
#this is where the data is hiding:
url = "https://site-search-api.prod.ecommerce.elsevier.com/search?query=catalyst%20for%20water%20splitting&labels=journals&start=0&limit=10&lang=en-xs"
html = requests.get(url)
soup = bs(html.content, features="html.parser")
data = json.loads(str(soup))  # the response is in json format, so we load it into a dictionary
Note: in this case, it's also possible to dispense with Beautifulsoup altogether and load the response directly, as in data = json.loads(html.content). From this point:
hits = data['hits']['hits']  # target urls are hidden deep inside nested dictionaries and lists
for hit in hits:
    print(hit['_source']['url'])
Output:
https://www.journals.elsevier.com/water-research
https://www.journals.elsevier.com/water-research-x
etc.
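As noted above, BeautifulSoup isn't needed for a JSON response at all; here is a shorter sketch using requests' own JSON decoding (assuming the API keeps the same response format):
import requests

# The API returns JSON, so requests can decode it directly
url = "https://site-search-api.prod.ecommerce.elsevier.com/search?query=catalyst%20for%20water%20splitting&labels=journals&start=0&limit=10&lang=en-xs"
data = requests.get(url).json()
for hit in data['hits']['hits']:
    print(hit['_source']['url'])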

Searching unique web links

I wrote a program to extract the web links from http://www.stevens.edu/.
Now I am facing the following problems with the program:
1 - I want to get only links starting with http and https.
2 - I am getting a parser warning from bs4 concerning the lack of specification of a parser - solved.
How can I fix these problems? I am not getting proper direction to solve them.
My code is:
import urllib2
from bs4 import BeautifulSoup as bs
url = raw_input('Please enter the url for which you want to see unique web links -')
print "\n"
req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib2.urlopen(req).read()
soup = bs(html)
tags = soup('a')

count = 0
web_link = []
for tag in tags:
    count = count + 1
    store = tag.get('href', None)
    web_link.append(store)

print "Total no. of extracted web links are", count, "\n"
print web_link
print "\n"

Unique_list = set(web_link)
Unique_list = list(Unique_list)
print "No. of the Unique web links after using set method", len(Unique_list), "\n"
For the second problem, you need to specify the parser while creating the BeautifulSoup object for the page:
soup = bs(html, "html.parser")
This should remove your warning.
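For the first problem, here is a minimal sketch that keeps only the links starting with http or https, written in the same Python 2 style as the question; it reuses the web_link list built above:
# Keep only absolute links that start with http:// or https://
http_links = [link for link in web_link
              if link and link.startswith(('http://', 'https://'))]
unique_http_links = set(http_links)
print "No. of unique http/https links", len(unique_http_links), "\n"
print list(unique_http_links)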

Python web Automation to get Email from Webpage

I want a Python script that opens a link and prints the email addresses from that page.
E.g.:
Go to some site like example.com.
Search for email addresses in it.
Search all the pages under that link.
I tried the code below:
import requests
from bs4 import BeautifulSoup

r = requests.get('http://www.digitalseo.in/')
data = r.text
soup = BeautifulSoup(data)

for rate in soup.find_all('#'):
    print rate.text
I took this website for reference.
Can anyone help me get this?
That's because find_all() only searches for tags. From the documentation:
Signature: find_all(name, attrs, recursive, string, limit, **kwargs)
The find_all() method looks through a tag’s descendants and retrieves all descendants that match your filters.
So you need to add a keyword argument like this:
import re
import requests
from bs4 import BeautifulSoup

r = requests.get('http://www.digitalseo.in/')
data = r.text
soup = BeautifulSoup(data, "html.parser")

for i in soup.find_all(href=re.compile("mailto")):
    print i.string
Demo:
contact@digitalseo.in
contact@digitalseo.in
From the documentation:
Any argument that’s not recognized will be turned into a filter on one of a tag’s attributes. If you pass in a value for an argument called id, Beautiful Soup will filter against each tag's 'id' attribute:
soup.find_all(id='link2')
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
If you pass in a value for href, Beautiful Soup will filter against each tag's 'href' attribute:
soup.find_all(href=re.compile("elsie"))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
You can see the document for more info: http://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all
And if you'd like to find the email addresses in a document, a regex is a good choice.
For example:
import re
re.findall(r'[^@]+@[^@]+\.[^@]+', text)  # remember to change the `text` variable
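A short usage sketch applying it to the data variable fetched earlier in this answer; the pattern is tightened slightly here so a match cannot run across whitespace (that variant is my own adjustment):
import re

# Find anything that looks like an email address in the page text
# (the [^@\s] classes stop a match from spanning whitespace)
emails = re.findall(r'[^@\s]+@[^@\s]+\.[^@\s]+', data)
print emails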
And if you'd like to find a link in a page by keyword, just use .get like this:
import re
import requests
from bs4 import BeautifulSoup

def get_link_by_keyword(keyword):
    links = set()
    for i in soup.find_all(href=re.compile(r"[http|/].*" + str(keyword))):
        links.add(i.get('href'))
    for i in links:
        if i[0] == 'h':
            yield i
        elif i[0] == '/':
            yield link + i
        else:
            pass

global link
link = raw_input('Please enter a link: ')
if link[-1] == '/':
    link = link[:-1]

r = requests.get(link, verify=True)
data = r.text
soup = BeautifulSoup(data, "html.parser")

for i in get_link_by_keyword(raw_input('Enter a keyword: ')):
    print i
