how to list unique urls from the given domain - python

I have written code to extract all urls from the given site, but the problem is that some urls are repeated, and I want a list of unique urls.
from bs4 import BeautifulSoup
from termcolor import colored
import re, os
import requests

url = 'http://example.com'
ext = 'html'
count = 0
countfiles = 0
files = []

def ulist(x):
    return list(dict.fromkeys(x))

def listFD(filename, ext=''):
    print filename
    print url
    if filename == url:
        page = requests.get(url).text
    else:
        page = requests.get(url + filename).text
    soup = BeautifulSoup(page, 'html.parser')
    return ['/' + node.get('href') for node in soup.find_all('a') if node.get('href').endswith(ext)]

for file in ulist(listFD(url, ext)):
    for unfile in ulist(listFD(file, ext)):
        print unfile

You can do it as below:
urls = list(set(urls))

Just wrap your list in Python's built-in set:
urls = ['www.google.com', 'www.google.com', 'www.facebook.com']
unique_urls = list(set(urls))
print(unique_urls) # prints >> ['www.facebook.com', 'www.google.com']

Once you have your list of urls, you can use a set to get the unique elements and a list comprehension to turn it back into a list:
unique_urls = [url for url in set(urls)]
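One extra note, not from the answers above: set() does not preserve the order in which the urls were found. If you want to keep the first-seen order, dict.fromkeys (which the question's own ulist helper already uses) does that on Python 3.7+:

urls = ['www.google.com', 'www.google.com', 'www.facebook.com']

# set-based dedup: unique, but the original order is not guaranteed
unique_unordered = list(set(urls))

# dict.fromkeys-based dedup: unique and keeps first-seen order (Python 3.7+)
unique_ordered = list(dict.fromkeys(urls))
print(unique_ordered)  # ['www.google.com', 'www.facebook.com']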


Using BeautifulSoup to grab the second part of a URL and store that text in a variable

I have a list of urls that all have the same first part. All the urls contain 'ingredient-disclosure' with the product category coming after it, separated by a /. I want to create a list that contains all the product categories.
So for the given url, I want to grab the text 'commercial-professional' and store it in a list that contains all the product categories.
Here is one of the urls: https://churchdwight.com/ingredient-disclosure/commercial-professional/42000024-ah-trash-can-dumpster-deodorizer.aspx
Thank you for any help!
You might want to consider using a Python set to store the categories so you end up with one of each.
Try the following example that uses their index page to get possible links:
import requests
from bs4 import BeautifulSoup
import csv

url = "https://churchdwight.com/ingredient-disclosure/"
req = requests.get(url)
soup = BeautifulSoup(req.content, "html.parser")

categories = set()

for a_tag in soup.find_all("a", href=True):
    url_parts = [p for p in a_tag["href"].split('/') if p]
    if len(url_parts) > 2 and url_parts[0] == "ingredient-disclosure":
        categories.update([url_parts[1]])

print("\n".join(sorted(categories)))
This would give you the following categories:
Nausea-Relief
antiperspirant-deodorant
cleaning-products
commercial-professional
cough-allergy
dental-care
depilatories
fabric-softener-sheets
feminine-hygiene
hair-care
hand-sanitizer
hemorrhoid-relief
laundry-fabric-care
nasal-care
oral-care
pain-relief
pet-care
pool-products
sexual-health
skin-care
wound-care
You split the urls on the "/" character and get whatever you need from the resulting list:
prod_cat_list = []
url = 'https://churchdwight.com/ingredient-disclosure/commercial-professional/42000024-ah-trash-can-dumpster-deodorizer.aspx'
parts = url.split('/')
domain = parts[2]
prod_category = parts[4]
prod_cat_list.append(prod_category)
print(prod_cat_list)
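As a slightly more defensive variant (just a sketch, assuming every url follows the .../ingredient-disclosure/<category>/... pattern), you can parse the path with urllib.parse first, so the scheme and domain never shift the index you split on:

from urllib.parse import urlparse

url = 'https://churchdwight.com/ingredient-disclosure/commercial-professional/42000024-ah-trash-can-dumpster-deodorizer.aspx'

# keep only the non-empty path segments after the domain
path_parts = [p for p in urlparse(url).path.split('/') if p]

if len(path_parts) > 1 and path_parts[0] == 'ingredient-disclosure':
    prod_category = path_parts[1]
    print(prod_category)  # commercial-professional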

scraping links from wikipedia

So I am trying to scrape links from a random Wikipedia page; here is my code thus far:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib2

# function get random page
def get_random():
    import requests
    # r = requests.get('https://en.wikipedia.org/wiki/Special:Random')
    r = requests.get('https://en.wikipedia.org/wiki/Carole_Ann')
    return r.url

#========================
#finding the valid link
def validlink(href):
    if href:
        if re.compile('^/wiki/').search(href):
            if not re.compile('/\w+:').search(href):
                return True
    return False
#validlink()===========

#the first site
a1 = get_random()
#print("the first site is: " + a1)
# the first site end()====

#looking for the article name:
blin = requests.get(a1)
soup = BeautifulSoup(blin.text, 'html.parser')
title = soup.find('h1', {'class' : 'firstHeading'})
print("starting website: " + a1 + " Titled: " + title.text)
print("")
#=============================
#first article done

#find body:
import re
body = requests.get(a1).text
soup = BeautifulSoup(body, 'lxml')

for link in soup.findAll("a"):
    url = link.get("href", "")
    print(
#======================
#======================
I know I'm doing this last part wrong. I'm new to Python, so I just have no idea how to go about it. What I need is to pull all of the links from the random site that the random page takes me to, along with the link and title of that site.
Then I need to pull the Wikipedia links off of that page, which is what I am trying to do in that last bit of code.
At that point I want to print all of the links it finds after they have been tested against my validlink function at the top.
Again, forgive me for being new and not understanding this yet, but please help; I cannot figure this out.
So the question I have is: I need to create a snippet of code that will pull all of the website links off of the Wikipedia page (note I still don't know how to do this; the for loop was my best guess based on my own research), then test the links I pulled against my validlink function, and print out all of the valid links.
If you want it as a list, then create a new list and append() each url if it is valid.
Because the same url can appear many times on a page, I also check whether the url is already in the list.
valid_urls = []

for link in soup.find_all('a'):  # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and validlink(url):
        valid_urls.append(url)

print(valid_urls)
from bs4 import BeautifulSoup
import requests
import re

# --- functions ---

def is_valid(url):
    """finding the valid link"""
    if url:
        if url.startswith('/wiki/'):  # you don't need `re` to check it
            if not re.compile('/\w+:').search(url):
                return True
    return False

# --- main ---

#random_url = 'https://en.wikipedia.org/wiki/Special:Random'
random_url = 'https://en.wikipedia.org/wiki/Carole_Ann'

r = requests.get(random_url)
print('url:', r.url)

soup = BeautifulSoup(r.text, 'html.parser')

title = soup.find('h1', {'class': 'firstHeading'})
print('starting website:', r.url)
print('titled:', title.text)
print()

valid_urls = []

for link in soup.find_all('a'):  # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and is_valid(url):
        valid_urls.append(url)

#print(valid_urls)

#for url in valid_urls:
#    print(url)

print('\n'.join(valid_urls))

How to scrape embedded integers on a website

I'm trying to scrape the number of likes for the datasets available on this website.
I've been unable to work out a way of reliably identifying and scraping the relationship between the dataset title and the like count, as it is embedded in the HTML as below:
I have used a scraper previously to get information about the resource urls. In that case I was able to capture the last child a of parent h3 with a parent having class .dataset-item.
I would like to adapt my existing code to scrape the number of likes for each resource in the catalogue, rather than the URLs. Below is the code for the url scraper I used:
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse

json_api_links = []
data_sets = []

def get_links(s, url, css_selector):
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    base = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    links = [base + item['href'] if item['href'][0] == '/' else item['href'] for item in soup.select(css_selector)]
    return links

results = []
#debug = []

with requests.Session() as s:

    for page in range(1,2): #set number of pages
        links = get_links(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-item h3 a:last-child')

        for link in links:
            data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
            json_api_links.append(data)
            #debug.append((link, data))

    resources = list(set([item.replace('opendata','') for sublist in json_api_links for item in sublist])) #can just leave as set

    for link in resources:
        try:
            r = s.get(link).json() #entire package info
            data_sets.append(r)
            title = r['result']['title'] #certain items
            if 'resources' in r['result']:
                urls = ' , '.join([item['url'] for item in r['result']['resources']])
            else:
                urls = 'N/A'
        except:
            title = 'N/A'
            urls = 'N/A'

        results.append((title, urls))

with open('data.csv','w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title','Resource Url'])
    for row in results:
        w.writerow(row)
My desired output would appear like this:
The approach is pretty straightforward. The website keeps the required elements inside <li> tags, so what you need to do is get the source of each <li> tag and fetch the heading, which has a certain class; the same goes for the like count.
The catch with the like count is that its text contains some noise. To fix that, you can use a regular expression to extract the digits ('\d+') from the likes text. The following code gives the desired result:
from bs4 import BeautifulSoup as soup
import requests
import re
import pandas as pd

source = requests.get('https://data.nsw.gov.au/data/dataset')
sp = soup(source.text, 'lxml')

element = sp.find_all('li', {'class': "dataset-item"})

heading = []
likeList = []

for i in element:
    try:
        header = i.find('a', {'class': "searchpartnership-url-analytics"})
        heading.append(header.text)
    except:
        header = i.find('a')
        heading.append(header.text)
    like = i.find('span', {'id': 'likes-count'})
    likeList.append(re.findall('\d+', like.text)[0])

# pd.DataFrame() does not accept index=False; build the frame from the dict directly
data = {'Title': heading, 'Likes': likeList}
df = pd.DataFrame(data)
print(df)
Hope it helped!
You could use the following.
I am using a CSS selector with OR (comma) syntax to retrieve the title and likes as one list (as every publication has both). I then use slicing to separate the titles from the likes.
from bs4 import BeautifulSoup as bs
import requests
import csv

def get_titles_and_likes(s, url, css_selector):
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    info = [item.text.strip() for item in soup.select(css_selector)]
    titles = info[::2]
    likes = info[1::2]
    return list(zip(titles, likes))

results = []

with requests.Session() as s:

    for page in range(1,10): #set number of pages
        data = get_titles_and_likes(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-heading .searchpartnership-url-analytics, .dataset-heading [href*="/data/dataset"], .dataset-item #likes-count')
        results.append(data)

results = [i for item in results for i in item]

with open(r'data.csv','w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title','Likes'])
    for row in results:
        w.writerow(row)

In Python3, how can I use the .append function to add a string to scraped links?

Thanks to stackoverflow.com I was able to write a program that scrapes web links from any given web page. However, I need it to concatenate the home URL to any relative link that it comes across. (Example: "http://www.google.com/sitemap" is okay, but just "/sitemap" by itself is not.)
In the following code,
from bs4 import BeautifulSoup as mySoup
from urllib.parse import urljoin as myJoin
from urllib.request import urlopen as myRequest

base_url = "https://www.census.gov/programs-surveys/popest.html"

html_page = myRequest(base_url)
raw_html = html_page.read()
page_soup = mySoup(raw_html, "html.parser")
html_page.close()

f = open("census4-3.csv", "w")

all_links = page_soup.find_all('a', href=True)

def clean_links(tags, base_url):
    cleaned_links = set()
    for tag in tags:
        link = tag.get('href')
        if link is None:
            continue
        full_url = myJoin(base_url, link)
        cleaned_links.add(full_url)
    return cleaned_links

cleaned_links = clean_links(all_links, base_url)

for link in cleaned_links:
    f.write(str(link) + '\n')

f.close()
print("The CSV file is saved to your computer.")
how and where would I add something like this:
.append("http://www.google.com")
You should save your base url as:
base_url = 'https://www.census.gov'
Then make the request like this:
html_page = myRequest(base_url + '/programs-surveys/popest.html')
When you want the full_url for any link, just do this:
full_url = base_url + link
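Note that the question's code already imports urljoin (aliased as myJoin), which handles this more generally than plain concatenation; a minimal sketch of the difference:

from urllib.parse import urljoin

base_url = "https://www.census.gov/programs-surveys/popest.html"

# a relative link is resolved against the base page
print(urljoin(base_url, "/sitemap"))
# -> https://www.census.gov/sitemap

# an already-absolute link is left untouched, where base_url + link would mangle it
print(urljoin(base_url, "http://www.google.com/sitemap"))
# -> http://www.google.com/sitemap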

Python: Returning parsed info to a list?

My code:
from urllib2 import urlopen
from bs4 import BeautifulSoup

url = "https://realpython.com/practice/profiles.html"
html_page = urlopen(url)
html_text = html_page.read()
soup = BeautifulSoup(html_text)
links = soup.find_all('a', href=True)
files = []

def page_names():
    for a in links:
        files.append(a['href'])
    return files

page_names()

print files[:]

base = "https://realpython.com/practice/"
print base + files[:]
I'm trying to parse out three webpage file names and append them to the "files" list, then somehow append or add them to the end of the base url for a simple print.
I've tried making "base" a single item list so I could append, but I am rather new to Python and believe I'm screwing up my for statement.
Currently I get:
print files[:]
TypeError: 'type' object has no attribute '__getitem__'
At the end you have used list[:]; that is wrong, since list is the name of Python's built-in type for creating a list, not your variable.
from urllib2 import urlopen
from bs4 import BeautifulSoup

url = "https://realpython.com/practice/profiles.html"
html_page = urlopen(url)
html_text = html_page.read()
soup = BeautifulSoup(html_text)
links = soup.find_all('a', href=True)
files = []

def page_names():
    for a in links:
        files.append(a['href'])

page_names()

base = "https://realpython.com/practice/"
for i in files:
    print base + i
Output:
https://realpython.com/practice/aphrodite.html
https://realpython.com/practice/poseidon.html
https://realpython.com/practice/dionysus.html
And you don't need to create an intermediate list for storing the links or files; just use a list comprehension.
from urllib2 import urlopen
from bs4 import BeautifulSoup

url = "https://realpython.com/practice/profiles.html"
html_page = urlopen(url)
html_text = html_page.read()
soup = BeautifulSoup(html_text)
files = [i['href'] for i in soup.find_all('a', href=True)]
base = "https://realpython.com/practice/"
for i in files:
    print base + i
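Both snippets above are Python 2 (urllib2 and the print statement). On Python 3 the same approach would look roughly like this, as a sketch that swaps in urllib.request and print():

from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://realpython.com/practice/profiles.html"
html_text = urlopen(url).read()
soup = BeautifulSoup(html_text, "html.parser")

base = "https://realpython.com/practice/"
for a in soup.find_all('a', href=True):
    print(base + a['href'])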
