Having trouble extracting the URL from a website

Having trouble extracting the URL from a website - python

So i want to extract url for all the buttons on the sidebar, but I can't seem to get past the first one, and I dont know why or how to fix it. Unfortunately, this is for an assignment so I cant import anything else.
This is the code I tried
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://books.toscrape.com/"
genres = ["Travel", "Mystery", "Historical Fiction", "Sequential Art", "Classics", "Philosophy"]
# write your code below
response=requests.get(url, timeout=3)
soup = BeautifulSoup(response.content, 'html.parser')
sidebar=soup.find_all('div',{'class':'side_categories'})
for a in sidebar:
genre_url=a.find('a').get('href')
print(genre_url)
I got
catalogue/category/books_1/index.html
I was expecting
catalogue/category/books_1/index.html
catalogue/category/books/travel_2/index.html
catalogue/category/books/mystery_3/index.html
catalogue/category/books/historical-fiction_4/index.html
catalogue/category/books/sequential-art_5/index.html
catalogue/category/books/classics_6/index.html
...

I used the following CSS selector to find all the tags from the sidebar: .side_categories>ul>li>ul>li>a
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://books.toscrape.com/"
genres = ["Travel", "Mystery", "Historical Fiction", "Sequential Art", "Classics", "Philosophy"]
# write your code below
response=requests.get(url, timeout=3)
soup = BeautifulSoup(response.content, 'html.parser')
genre_url_elems = soup.select(".side_categories>ul>li>ul>li>a")
genre_urls = [e['href'] for e in genre_url_elems]
for url in genre_urls:
print(url)
Here's the output:
catalogue/category/books/travel_2/index.html
catalogue/category/books/mystery_3/index.html
catalogue/category/books/historical-fiction_4/index.html
catalogue/category/books/sequential-art_5/index.html
catalogue/category/books/classics_6/index.html
catalogue/category/books/philosophy_7/index.html
catalogue/category/books/romance_8/index.html
catalogue/category/books/womens-fiction_9/index.html
catalogue/category/books/fiction_10/index.html
catalogue/category/books/childrens_11/index.html
catalogue/category/books/religion_12/index.html
catalogue/category/books/nonfiction_13/index.html
catalogue/category/books/music_14/index.html
catalogue/category/books/default_15/index.html
catalogue/category/books/science-fiction_16/index.html
catalogue/category/books/sports-and-games_17/index.html
catalogue/category/books/add-a-comment_18/index.html
catalogue/category/books/fantasy_19/index.html
catalogue/category/books/new-adult_20/index.html
catalogue/category/books/young-adult_21/index.html
catalogue/category/books/science_22/index.html
catalogue/category/books/poetry_23/index.html
catalogue/category/books/paranormal_24/index.html
catalogue/category/books/art_25/index.html
catalogue/category/books/psychology_26/index.html
catalogue/category/books/autobiography_27/index.html
catalogue/category/books/parenting_28/index.html
catalogue/category/books/adult-fiction_29/index.html
catalogue/category/books/humor_30/index.html
catalogue/category/books/horror_31/index.html
catalogue/category/books/history_32/index.html
catalogue/category/books/food-and-drink_33/index.html
catalogue/category/books/christian-fiction_34/index.html
catalogue/category/books/business_35/index.html
catalogue/category/books/biography_36/index.html
catalogue/category/books/thriller_37/index.html
catalogue/category/books/contemporary_38/index.html
catalogue/category/books/spirituality_39/index.html
catalogue/category/books/academic_40/index.html
catalogue/category/books/self-help_41/index.html
catalogue/category/books/historical_42/index.html
catalogue/category/books/christian_43/index.html
catalogue/category/books/suspense_44/index.html
catalogue/category/books/short-stories_45/index.html
catalogue/category/books/novels_46/index.html
catalogue/category/books/health_47/index.html
catalogue/category/books/politics_48/index.html
catalogue/category/books/cultural_49/index.html
catalogue/category/books/erotica_50/index.html
catalogue/category/books/crime_51/index.html
For more, read about 'CSS selectors': https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors

Here you go:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://books.toscrape.com/"
genres = ["Travel", "Mystery", "Historical Fiction", "Sequential Art", "Classics", "Philosophy"]
# write your code below
response=requests.get(url, timeout=3)
soup = BeautifulSoup(response.content, 'html.parser')
# sidebar=soup.find_all('div',{'class':'side_categories'})
sidebar=soup.find_all('a',href=True)
for link in sidebar:
url = link['href']
if 'catalogue' in url:
print(url)

Related

How to scrape review to dataframe

I would like to scratch the reviews from this page and save them as a data frame, but I do not download star ratings and the text of the review. Just only text. What i did wrong?
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.morele.net/pralka-candy-cs4-1062d3-950636/?sekcja=reviews-all")
soup = BeautifulSoup(page.content, "html.parser",
).find_all("div", {"class":"reviews-item"})
# print(soup)
morele = [div.getText(strip=True) for div in soup]
print(morele)
csv_table = pd.DataFrame(morele)
csv_table = csv_table.reset_index(drop=True)
csv_table.insert(0,'No.',csv_table.index)

You are mostly there - just further navigate the DOM and you can get just the text.
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.morele.net/pralka-candy-cs4-1062d3-950636/?sekcja=reviews-all")
soup = BeautifulSoup(page.content, "html.parser",)
data = [{"text":ri.find("div", {"class":"rev-desc"}).getText(strip=True) ,
"stars":ri.find("div", {"class":"rev-stars"}).getText(strip=True)}
for ri in soup.find_all("div", {"class":"reviews-item"})
]
pd.DataFrame(data)

Get the descriptions of product details in the script by BeautifulSoup and json

I'm getting descriptions of product details in the tag from the web.
Here's the code :
import re
import json
from bs4 import BeautifulSoup
import requests
url = 'https://oldnavy.gap.com/browse/product.do?pid=599211032&rrec=true&mlink=5050,12413545,onproduct1_rr_3&clink=12413545#pdp-page-content'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
results = soup.find('script', id="pdpData").string
jsData = re.search(r'window.__ PRODUCT_PAGE_STATE __\s+=\s+', results)
data = json.loads(jsData.group(0))
and the part xxxxx I want is below in the script
window. __PRODUCT_PAGE_STATE __ = JSON.parse(xxxxx)
Through re.search to find window. __PRODUCT_PAGE_STATE __ , I still can not reach the part xxxxx.
Is there any other way to extract the info in the part xxxxx?

Try this:
import re
import json
from bs4 import BeautifulSoup
import requests
url = 'https://oldnavy.gap.com/browse/product.do?pid=599211032&rrec=true&mlink' \
'=5050,12413545,onproduct1_rr_3&clink=12413545#pdp-page-content'
soup = BeautifulSoup(
requests.get(url).content,
"html.parser",
).find('script', id="pdpData")
the_xxx_part = json.loads(
re.search(r"\.parse\((.+)\);", soup.string, re.S).group(1).strip(),
)
print(json.loads(the_xxx_part)["productData"]["name"])
Output:
Unisex Faux-Fur-Trim Hooded Frost-Free Puffer Jacket for Toddler
To print the entire JOSN object edit this:
print(json.loads(the_xxx_part)["productData"]["name"])
into this:
print(json.loads(the_xxx_part))

Python html parsing using beautiful soup issues

I am trying to get the name of all organizations from https://www.devex.com/organizations/search using beautifulsoup.However, I am getting an error. Can someone please help.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
headers = {"Accept-Language": "en-US,en;q=0.5"}
titles = []
pages = np.arange(1, 2, 1)
for page in pages:
page = requests.get("https://www.devex.com/organizations/search?page%5Bnumber%5D=" + str(page) + "", headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
movie_div = soup.find_all('div', class_='info-container')
sleep(randint(2,10))
for container in movie_div:
name = container.a.find('h3', class_= 'ng-binding').text
titles.append(name)
movies = pd.DataFrame({
'movie': titles,
})
to see your dataframe
print(movies)
to see the datatypes of your columns
print(movies.dtypes)
to see where you're missing data and how much data is missing
print(movies.isnull().sum())
to move all your scraped data to a CSV file
movies.to_csv('movies.csv')

you may try with something like
name = bs.find("h3", {"class": "ng-binding"})

Scraping with beautifulsoup trying to get all the href attributes

Im trying to scrape all the urls from amazon categories website (https://www.amazon.com/gp/site-directory/ref=nav_shopall_btn)
but I can just get the first url of any category. For example, from "Amazon video" I am getting "All videos", "Fire TV" amazon fire tv, etc.
That is my code:
from bs4 import BeautifulSoup
import requests
url = "https://www.amazon.es/gp/site-directory/ref=nav_shopall_btn"
amazon_link = requests.get(url)
html = BeautifulSoup(amazon_link.text,"html.parser")
categorias_amazon = html.find_all('div',{'class':'popover-grouping'})
for i in range(len(categorias_amazon)):
print("www.amazon.es" + categorias_amazon[i].a['href'])
I have tried with:
print("www.amazon.es" + categorias_amazon[i].find_all['a'])
but I get an error. I am looking to get href attribute of every sub category.

You can try this code:
from bs4 import BeautifulSoup
import requests
url = "https://www.amazon.es/gp/site-directory/ref=nav_shopall_btn"
amazon_link = requests.get(url)
html = BeautifulSoup(amazon_link.text,"html.parser")
# print html
categorias_amazon = html.find_all('div',{'class':'popover-grouping'})
allurls=html.select("div.popover-grouping [href]")
values=[link['href'].strip() for link in allurls]
for value in values:
print("www.amazon.es" + value)
It will print:
www.amazon.es/b?ie=UTF8&node=1748200031
www.amazon.es/gp/dmusic/mp3/player
www.amazon.es/b?ie=UTF8&node=2133385031
www.amazon.es/clouddrive/primephotos
www.amazon.es/clouddrive/home
www.amazon.es/clouddrive/home#download-section
www.amazon.es/clouddrive?_encoding=UTF8&sf=1
www.amazon.es/dp/B0186FET66
www.amazon.es/dp/B00QJDO0QC
www.amazon.es/dp/B00IOY524S
www.amazon.es/dp/B010EK1GOE
www.amazon.es/b?ie=UTF8&node=827234031
www.amazon.es/ebooks-kindle/b?ie=UTF8&node=827231031
www.amazon.es/gp/kindle/ku/sign-up/
www.amazon.es/b?ie=UTF8&node=8504981031
www.amazon.es/gp/digital/fiona/kcp-landing-page
www.amazon.eshttps://www.amazon.es:443/gp/redirect.html?location=https://leer.amazon.es/&token=CA091C61DBBA8A5C0F6E4A46ED30C059164DBC74&source=standards
www.amazon.es/gp/digital/fiona/manage
www.amazon.es/dp/B00ZDWLEEG
www.amazon.es/dp/B00IRKMZX0
www.amazon.es/dp/B01AHBC23E
www.amazon.es/b?ie=UTF8&node=827234031
www.amazon.es/mobile-apps/b?ie=UTF8&node=1661649031
www.amazon.es/b?ie=UTF8&node=1726755031
www.amazon.es/b?ie=UTF8&node=1748200031
www.amazon.es/ebooks-kindle/b?ie=UTF8&node=827231031
www.amazon.es/gp/digital/fiona/manage
www.amazon.es/b?ie=UTF8&node=10909716031
www.amazon.es/b?ie=UTF8&node=10909718031
www.amazon.es/b?ie=UTF8&node=10909719031
www.amazon.es/b?ie=UTF8&node=10909720031
www.amazon.es/b?ie=UTF8&node=10909721031
www.amazon.es/b?ie=UTF8&node=10909722031
www.amazon.es/b?ie=UTF8&node=8464150031
www.amazon.es/mobile-apps/b?ie=UTF8&node=1661649031
www.amazon.es/b?ie=UTF8&node=1726755031
www.amazon.es/b?ie=UTF8&node=4622953031
www.amazon.es/gp/feature.html?ie=UTF8&docId=1000658923
www.amazon.es/gp/mas/your-account/myapps
www.amazon.es/comprar-libros-espa%C3%B1ol/b?ie=UTF8&node=599364031
www.amazon.es/ebooks-kindle/b?ie=UTF8&node=827231031
www.amazon.es/gp/kindle/ku/sign-up/
www.amazon.es/Libros-en-ingl%C3%A9s/b?ie=UTF8&node=665418031
www.amazon.es/Libros-en-otros-idiomas/b?ie=UTF8&node=599367031
www.amazon.es/b?ie=UTF8&node=902621031
www.amazon.es/libros-texto/b?ie=UTF8&node=902673031
www.amazon.es/Blu-ray-DVD-peliculas-series-3D/b?ie=UTF8&node=599379031
www.amazon.es/series-tv-television-DVD-Blu-ray/b?ie=UTF8&node=665293031
www.amazon.es/Blu-ray-peliculas-series-3D/b?ie=UTF8&node=665303031
www.amazon.es/M%C3%BAsica/b?ie=UTF8&node=599373031
www.amazon.es/b?ie=UTF8&node=1748200031
www.amazon.es/musical-instruments/b?ie=UTF8&node=3628866031
www.amazon.es/fotografia-videocamaras/b?ie=UTF8&node=664660031
www.amazon.es/b?ie=UTF8&node=931491031
www.amazon.es/tv-video-home-cinema/b?ie=UTF8&node=664659031
www.amazon.es/b?ie=UTF8&node=664684031
www.amazon.es/gps-accesorios/b?ie=UTF8&node=664661031
www.amazon.es/musical-instruments/b?ie=UTF8&node=3628866031
www.amazon.es/accesorios/b?ie=UTF8&node=928455031
www.amazon.es/Inform%C3%A1tica/b?ie=UTF8&node=667049031
www.amazon.es/Electr%C3%B3nica/b?ie=UTF8&node=599370031
www.amazon.es/portatiles/b?ie=UTF8&node=938008031
www.amazon.es/tablets/b?ie=UTF8&node=938010031
www.amazon.es/ordenadores-sobremesa/b?ie=UTF8&node=937994031
www.amazon.es/componentes/b?ie=UTF8&node=937912031
www.amazon.es/b?ie=UTF8&node=2457643031
www.amazon.es/b?ie=UTF8&node=2457641031
www.amazon.es/Software/b?ie=UTF8&node=599376031
www.amazon.es/pc-videojuegos-accesorios-mac/b?ie=UTF8&node=665498031
www.amazon.es/Inform%C3%A1tica/b?ie=UTF8&node=667049031
www.amazon.es/material-oficina/b?ie=UTF8&node=4352791031
www.amazon.es/productos-papel-oficina/b?ie=UTF8&node=4352794031
www.amazon.es/boligrafos-lapices-utiles-escritura/b?ie=UTF8&node=4352788031
www.amazon.es/electronica-oficina/b?ie=UTF8&node=4352790031
www.amazon.es/oficina-papeleria/b?ie=UTF8&node=3628728031
www.amazon.es/videojuegos-accesorios-consolas/b?ie=UTF8&node=599382031
www.amazon.es/b?ie=UTF8&node=665290031
www.amazon.es/pc-videojuegos-accesorios-mac/b?ie=UTF8&node=665498031
www.amazon.es/b?ie=UTF8&node=8490963031
www.amazon.es/b?ie=UTF8&node=1381541031
www.amazon.es/Juguetes-y-juegos/b?ie=UTF8&node=599385031
www.amazon.es/bebe/b?ie=UTF8&node=1703495031
www.amazon.es/baby-reg/homepage
www.amazon.es/gp/family/signup
www.amazon.es/b?ie=UTF8&node=2181872031
www.amazon.es/b?ie=UTF8&node=3365351031
www.amazon.es/bano/b?ie=UTF8&node=3244779031
www.amazon.es/b?ie=UTF8&node=1354952031
www.amazon.es/iluminacion/b?ie=UTF8&node=3564289031
www.amazon.es/pequeno-electrodomestico/b?ie=UTF8&node=2165363031
www.amazon.es/aspiracion-limpieza-planchado/b?ie=UTF8&node=2165650031
www.amazon.es/almacenamiento-organizacion/b?ie=UTF8&node=3359926031
www.amazon.es/climatizacion-calefaccion/b?ie=UTF8&node=3605952031
www.amazon.es/Hogar/b?ie=UTF8&node=599391031
www.amazon.es/herramientas-electricas-mano/b?ie=UTF8&node=3049288031
www.amazon.es/Cortacespedes-Tractores-Jardineria/b?ie=UTF8&node=3249445031
www.amazon.es/instalacion-electrica/b?ie=UTF8&node=3049284031
www.amazon.es/accesorios-cocina-bano/b?ie=UTF8&node=3049286031
www.amazon.es/seguridad/b?ie=UTF8&node=3049292031
www.amazon.es/Bricolaje-Herramientas-Fontaneria-Ferreteria-Jardineria/b?ie=UTF8&node=2454133031
www.amazon.es/Categorias/b?ie=UTF8&node=6198073031
www.amazon.es/b?ie=UTF8&node=6348071031
www.amazon.es/Categorias/b?ie=UTF8&node=6198055031
www.amazon.es/b?ie=UTF8&node=12300685031
www.amazon.es/Salud-y-cuidado-personal/b?ie=UTF8&node=3677430031
www.amazon.es/Suscribete-Ahorra/b?ie=UTF8&node=9699700031
www.amazon.es/Amazon-Pantry/b?ie=UTF8&node=10547412031
www.amazon.es/moda-mujer/b?ie=UTF8&node=5517558031
www.amazon.es/moda-hombre/b?ie=UTF8&node=5517557031
www.amazon.es/moda-infantil/b?ie=UTF8&node=5518995031
www.amazon.es/bolsos-mujer/b?ie=UTF8&node=2007973031
www.amazon.es/joyeria/b?ie=UTF8&node=2454126031
www.amazon.es/relojes/b?ie=UTF8&node=599388031
www.amazon.es/equipaje/b?ie=UTF8&node=2454129031
www.amazon.es/gp/feature.html?ie=UTF8&docId=12464607031
www.amazon.es/b?ie=UTF8&node=8520792031
www.amazon.es/running/b?ie=UTF8&node=2928523031
www.amazon.es/fitness-ejercicio/b?ie=UTF8&node=2928495031
www.amazon.es/ciclismo/b?ie=UTF8&node=2928487031
www.amazon.es/tenis-padel/b?ie=UTF8&node=2985165031
www.amazon.es/golf/b?ie=UTF8&node=2928503031
www.amazon.es/deportes-equipo/b?ie=UTF8&node=2975183031
www.amazon.es/deportes-acuaticos/b?ie=UTF8&node=2928491031
www.amazon.es/deportes-invierno/b?ie=UTF8&node=2928493031
www.amazon.es/Tiendas-campa%C3%B1a-Sacos-dormir-Camping/b?ie=UTF8&node=2928471031
www.amazon.es/deportes-aire-libre/b?ie=UTF8&node=2454136031
www.amazon.es/ropa-calzado-deportivo/b?ie=UTF8&node=2975170031
www.amazon.es/calzado-deportivo/b?ie=UTF8&node=2928484031
www.amazon.es/electronica-dispositivos-el-deporte/b?ie=UTF8&node=2928496031
www.amazon.es/Coche-y-moto/b?ie=UTF8&node=1951051031
www.amazon.es/b?ie=UTF8&node=2566955031
www.amazon.es/gps-accesorios/b?ie=UTF8&node=664661031
www.amazon.es/Motos-accesorios-piezas/b?ie=UTF8&node=2425161031
www.amazon.es/industrial-cientfica/b?ie=UTF8&node=5866088031
www.amazon.es/b?ie=UTF8&node=6684191031
www.amazon.es/b?ie=UTF8&node=6684193031
www.amazon.es/b?ie=UTF8&node=6684192031
www.amazon.es/handmade/b?ie=UTF8&node=9699482031
www.amazon.es/b?ie=UTF8&node=10740508031
www.amazon.es/b?ie=UTF8&node=10740511031
www.amazon.es/b?ie=UTF8&node=10740559031
www.amazon.es/b?ie=UTF8&node=10740502031
www.amazon.es/b?ie=UTF8&node=10740505031
Hope this is what you were looking for.

Do you want to scrapp it or scrape it? If it's the latter, that about this?
from BeautifulSoup import BeautifulSoup
import urllib2
import re
html_page = urllib2.urlopen("https://www.amazon.es/gp/site-directory/ref=nav_shopall_btn")
soup = BeautifulSoup(html_page)
for link in soup.findAll('a'):
print link.get('href')

Scrape Multiple URLs using Beautiful Soup

I'm trying to extract specific classes from multiple URLs. The tags and classes stay the same but I need my python program to scrape all as I just input my link.
Here's a sample of my work:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
url = input('insert URL here: ')
#scrape elements
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
#print titles only
h1 = soup.find("h1", class_= "class-headline")
print(h1.get_text())
This works for individual URLs but not for a batch. Thanks for helping me. I learned a lot from this community.

Have a list of urls and iterate through it.
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
urls = ['www.website1.com', 'www.website2.com', 'www.website3.com', .....]
#scrape elements
for url in urls:
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
#print titles only
h1 = soup.find("h1", class_= "class-headline")
print(h1.get_text())
If you are going to prompt user for input for each site then it can be done this way
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
urls = ['www.website1.com', 'www.website2.com', 'www.website3.com', .....]
#scrape elements
msg = 'Enter Url, to exit type q and hit enter.'
url = input(msg)
while(url!='q'):
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
#print titles only
h1 = soup.find("h1", class_= "class-headline")
print(h1.get_text())
input(msg)

If you want to scrape links in batches. Specify a batch size and iterate over it.
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
batch_size = 5
urllist = ["url1", "url2", "url3", .....]
url_chunks = [urllist[x:x+batch_size] for x in xrange(0, len(urllist), batch_size)]
def scrape_url(url):
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
h1 = soup.find("h1", class_= "class-headline")
return (h1.get_text())
def scrape_batch(url_chunk):
chunk_resp = []
for url in url_chunk:
chunk_resp.append(scrape_url(url))
return chunk_resp
for url_chunk in url_chunks:
print scrape_batch(url_chunk)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Having trouble extracting the URL from a website - python

Related

How to scrape review to dataframe

Get the descriptions of product details in the script by BeautifulSoup and json

Python html parsing using beautiful soup issues

Scraping with beautifulsoup trying to get all the href attributes

Scrape Multiple URLs using Beautiful Soup

Categories

Resources