I'm trying to extract elements with specific classes from multiple URLs. The tags and classes stay the same on every page, but I need my Python program to scrape all of them as I input the links.
Here's a sample of my work:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
url = input('insert URL here: ')
#scrape elements
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
#print titles only
h1 = soup.find("h1", class_="class-headline")
print(h1.get_text())
This works for individual URLs but not for a batch. Thanks for helping me. I learned a lot from this community.
Put the URLs in a list and iterate over it:
from bs4 import BeautifulSoup
import requests

# requests needs a scheme (http:// or https://) on each URL
urls = ['http://www.website1.com', 'http://www.website2.com', 'http://www.website3.com', .....]
#scrape elements
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    #print titles only
    h1 = soup.find("h1", class_="class-headline")
    print(h1.get_text())
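requests.get will raise for unreachable hosts and soup.find returns None when a page has no matching heading, so a batch run can die halfway through. A minimal guard, assuming the same class-headline selector as above:

for url in urls:
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print('skipping {}: {}'.format(url, e))
        continue
    soup = BeautifulSoup(response.content, "html.parser")
    h1 = soup.find("h1", class_="class-headline")
    # find() returns None when no tag matches, so guard before calling get_text()
    print(h1.get_text() if h1 else 'no headline found on {}'.format(url))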
If you are going to prompt the user for each URL instead, it can be done this way:
from bs4 import BeautifulSoup
import requests

#scrape elements
msg = 'Enter URL, or type q and hit enter to exit: '
url = input(msg)
while url != 'q':
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    #print titles only
    h1 = soup.find("h1", class_="class-headline")
    print(h1.get_text())
    url = input(msg)  # re-assign here, otherwise the loop never sees the next URL
If you want to scrape the links in batches, specify a batch size and iterate over the chunks:
from bs4 import BeautifulSoup
import requests

batch_size = 5
urllist = ["url1", "url2", "url3", .....]
# split the list into chunks of batch_size (xrange is Python 2; use range in Python 3)
url_chunks = [urllist[x:x+batch_size] for x in range(0, len(urllist), batch_size)]

def scrape_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    h1 = soup.find("h1", class_="class-headline")
    return h1.get_text()

def scrape_batch(url_chunk):
    chunk_resp = []
    for url in url_chunk:
        chunk_resp.append(scrape_url(url))
    return chunk_resp

for url_chunk in url_chunks:
    print(scrape_batch(url_chunk))
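If the point of batching is throughput rather than grouping the results, a thread pool from the standard library gives the same effect with less bookkeeping; a sketch reusing scrape_url from above:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=batch_size) as pool:
    # map() runs scrape_url concurrently and yields results in input order
    for title in pool.map(scrape_url, urllist):
        print(title)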
Related
How do I print only the description of a specific hotel from booking.com?
For example, I use this code but it returns None!
import requests
from scrapy.selector import Selector

url = 'https://www.booking.com/hotel/eg/mediterranean-azur.html?'
response2 = requests.get(url)
if response2.ok:
    # Selector expects str, so pass response2.text rather than the raw bytes
    selector = Selector(text=response2.text)
    print(selector.xpath("//div[@class='hp__hotel-name']").get())
Any kind of help please?
Try this:
import requests
from bs4 import BeautifulSoup

url = "https://www.booking.com/hotel/eg/mediterranean-azur.uk.html?"
r = requests.get(url)
soup = BeautifulSoup(r.text, "lxml")
answ = soup.find("div", {"id": "property_description_content"}).text
print(answ)
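One caveat: booking.com often serves an error page to clients that don't look like a browser, so if find() returns None, sending a browser-like User-Agent header may help. A sketch (the header string is only an example):

import requests
from bs4 import BeautifulSoup

url = "https://www.booking.com/hotel/eg/mediterranean-azur.uk.html?"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}  # example UA, swap in any browser string
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, "lxml")
desc = soup.find("div", {"id": "property_description_content"})
print(desc.text if desc else "description block not found")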
I'm trying to scrape all of the proxy IPs from this site: https://proxy-list.org/english/index.php, but I can only get one IP at most.
Here is my code:
from helium import *
from bs4 import BeautifulSoup

url = 'https://proxy-list.org/english/index.php'
browser = start_chrome(url, headless=True)
soup = BeautifulSoup(browser.page_source, 'html.parser')
proxies = soup.find_all('div', {'class': 'table'})
for ips in proxies:
    print(ips.find('li', {'class': 'proxy'}).text)
I tried to use ips.find_all but it didn't work.
from bs4 import BeautifulSoup
import requests

url = 'https://proxy-list.org/english/index.php'
pagecontent = requests.get(url)
# pass the response text to BeautifulSoup (the original referenced an undefined browser object)
soup = BeautifulSoup(pagecontent.text, 'html.parser')
maintable = soup.find_all('div', {'class': 'table'})
for div_element in maintable:
    rows = div_element.find_all('li', class_='proxy')
    for ip in rows:
        print(ip.text)
If I get your question right, the following is one way to fetch those proxies using the requests module and the BeautifulSoup library:
import re
import base64
import requests
from bs4 import BeautifulSoup

url = 'https://proxy-list.org/english/index.php'

def decode_proxy(target_str):
    converted_proxy = base64.b64decode(target_str)
    return converted_proxy.decode()

res = requests.get(url)
soup = BeautifulSoup(res.text, 'html.parser')
for tr in soup.select("#proxy-table li.proxy > script"):
    proxy_id = re.findall(r"Proxy[^']+(.*)\'", tr.contents[0])[0]
    print(decode_proxy(proxy_id))
First few results:
62.80.180.111:8080
68.183.221.156:38159
189.201.134.13:8080
178.60.201.44:8080
128.199.79.15:8080
139.59.78.193:8080
103.148.216.5:80
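The trick here is that the site embeds each address as a base64 string inside a Proxy('...') call in a script tag; the regex captures that argument and decode_proxy reverses the encoding. A round trip with a made-up address shows the idea:

import base64

encoded = base64.b64encode(b'1.2.3.4:8080').decode()  # what the page would embed
print(encoded)                              # MS4yLjMuNDo4MDgw
print(base64.b64decode(encoded).decode())   # 1.2.3.4:8080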
I'm trying to scrape a webpage using BeautifulSoup, but findAll() returns an empty list. This is my code:
import requests
from bs4 import BeautifulSoup

URL = "https://elcinema.com/en/index/work/country/eg?page=1"
r = requests.get(URL)
bsObj = BeautifulSoup(r.content, 'html5lib')
recordList = bsObj.findAll('a', attrs={'class': "lazy-loaded "})
print(recordList)
What am I doing wrong?
You need to find img tags with the class lazy-loaded:
import requests
from bs4 import BeautifulSoup

URL = "https://elcinema.com/en/index/work/country/eg?page=1"
r = requests.get(URL)
bsObj = BeautifulSoup(r.content, 'html.parser')
recordList = bsObj.findAll('img', class_="lazy-loaded")
recordList = [i['data-src'] for i in recordList]
print(recordList)
Output:
['https://media.elcinema.com/blank_photos/75x75.jpg', 'https://media.elcinema.com/uploads/_75x75_2fe90cb32f2759181f71eb2a9b29f0735f87ac88150a6a8fd3734300f8714369.jpg', 'https://media.elcinema.com/uploads/_75x75_3d90d1ee22c5f455bc4556073eab69cd218446d6134dc0f2694782ee39ccb5bf.jpg', 'https://media.elcinema.com/uploads/_75x75_81f30061ed82645e9ee688642275d76a23ee329344c5ac25c42f22afa35432ff.jpg', 'https://media.elcinema.com/blank_photos/75x75.jpg',.......]
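Since the URL ends in ?page=1, the listing is paginated; a sketch that walks the first few pages, assuming each page uses the same markup:

import requests
from bs4 import BeautifulSoup

all_images = []
for page in range(1, 4):  # first three pages as an example
    r = requests.get('https://elcinema.com/en/index/work/country/eg?page={}'.format(page))
    bsObj = BeautifulSoup(r.content, 'html.parser')
    # collect the lazy-loaded image URLs from each page
    all_images += [img['data-src'] for img in bsObj.find_all('img', class_='lazy-loaded')]
print(len(all_images))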
According to the question, it looks like you need to find all a tags that contain an img tag with the specific class lazy-loaded. Follow the code below to get those:
Code:
import requests
from bs4 import BeautifulSoup

URL = "https://elcinema.com/en/index/work/country/eg?page=1"
r = requests.get(URL)
bsObj = BeautifulSoup(r.content, 'html.parser')
outputdata = []
recordList = bsObj.findAll('a')
for record in recordList:
    if record.find("img", {"class": "lazy-loaded"}):
        outputdata.append(record)
print(len(outputdata))
print(outputdata)
Let me know if you have any questions :)
I am trying to extract the chemical names (all in capital letters) from the URL below.
https://www.legislation.gov.au/Details/F2020L01255
I am interested in the chemicals that are listed in Schedule 4.
import requests
import re
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = 'https://www.legislation.gov.au/Details/F2020L01255'
headers = {"Accept-Language": "EN-AU, en;q=0.5"}
results = requests.get(url, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")
chemicals = []
chems_div = soup.find_all('div', class_='WordSection7')
I am stuck from here. The chemical names are wrapped in p tags with class='MsoNormal' and span tags with lang='EN-AU'.
Try this:
import requests
from bs4 import BeautifulSoup
url = 'https://www.legislation.gov.au/Details/F2020L01255'
headers = {"Accept-Language": "EN-AU, en;q=0.5"}
results = requests.get(url, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")
chems_div = soup.find('div', class_='WordSection7')
all_spans = [
t.getText(strip=True) for t in
chems_div.find_all("span", {"lang": "EN-AU"})
]
print([w for w in all_spans if w.isupper() and w != "SCHEDULE 4"])
Output:
['ABACAVIR.', 'ABATACEPT.', 'ABIRATERONE ACETATE.', 'ABCIXIMAB.', 'ABEMACICLIB.', 'ACALABRUTINIB.', 'ACAMPROSATE CALCIUM.', 'ACARBOSE.', 'ACEBUTOLOL.', 'ACEPROMAZINE.', 'ACETARSOL.', 'ACETAZOLAMIDE.', 'ACETOHEXAMIDE.', 'ACETYL ISOVALERYLTYLOSIN.', 'ACETYLCARBROMAL.', 'ACETYLCHOLINE.', 'ACETYLDIGITOXIN.', 'ACETYLMETHYLDIMETHYLOXIMIDOPHENYLHYDRAZINE.', 'ACETYLSTROPHANTHIDIN.', 'ACIPIMOX.', '# ACITRETIN.', 'ACLIDINIUM BROMIDE.', 'ACOKANTHERA OUABAIO.', 'ACOKANTHERA SCHIMPERI.', 'ACRIVASTINE.', 'ADALIMUMAB.', 'ADAPALENE.', 'ADEFOVIR.', 'ADIPHENINE.', 'ADONIS VERNALIS.', 'ADRAFINIL.', 'AFAMELANOTIDE.', 'AFATINIB DIMALEATE.'
and so on...
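Since the question already imports pandas, the filtered list drops straight into a DataFrame if you want the names as a CSV; a minimal sketch reusing all_spans from above (the filename is arbitrary):

import pandas as pd

chemicals = [w for w in all_spans if w.isupper() and w != "SCHEDULE 4"]
pd.DataFrame({"chemical": chemicals}).to_csv("schedule4_chemicals.csv", index=False)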
I am trying to extract the coordinates of all of the restaurant's locations using BeautifulSoup. How can I extract all of them from the script tag under the body?
from bs4 import BeautifulSoup as bs
import requests
import json

base_url = 'https://locations.wafflehouse.com/'
r = requests.get(base_url)
soup = bs(r.text, 'html.parser')
all_scripts = soup.find_all('script')
print(all_scripts[19])
UPDATED ANSWER:
You need to parse the JSON with json.loads() and then navigate the resulting structure. Try this code:
import json, requests
from bs4 import BeautifulSoup

req = requests.get('https://locations.wafflehouse.com/')
soup = BeautifulSoup(req.content, 'html.parser')
data = soup.find_all('script')[19].text
# strip the JS assignment so only the JSON literal remains
jdata = data.replace('window.__SLS_REDUX_STATE__ =', '').strip().rstrip(';')
data = json.loads(jdata)
for i in data['dataLocations']['collection']['features']:
    LatLong = i['geometry']['coordinates']
    print(LatLong)
Output:
[-90.073113, 30.37019]
[-84.131085, 33.952944]
[-78.719497, 36.14261]
[-95.629084, 29.947421]
[-83.9019, 33.56531]
[-80.091552, 37.288422]
[-77.949231, 34.237534]
[-96.60637, 32.968131]
[-80.969088, 29.151235]
[-86.843386, 33.354666]
[-84.206, 33.462175]
[-76.342464, 36.830187]
[-79.985822, 32.898412]
[-84.2784568595722, 33.
[-88.780694, 35.674914]
[-87.898899, 30.598605]
[-83.71487, 32.614092]
[-79.523611, 36.07101]
[-91.127792, 30.580582]
[-86.352681, 35.875097]
[-90.271372, 30.023002]
[-80.205641, 25.955672]
[-81.632, 30.157]
[-86.961821, 31.454352]
[-80.666906, 35.366769]
[-97.56596, 35.406447]
[-84.364334, 35.511474]
[-81.01622, 29.23453]
[-86.57177, 34.855504]
[-84.625908, 33.399829]
[-76.344303, 36.740862]
[-84.192634, 33.517948]
[-77.83421, 39.296024]
[-77.518985, 38.359332]
[-84.45238, 38.042061]
[-83.08319, 39.840191]
[-81.993971, 33.475816]
[-95.481102, 29.913294]
[-82.699, 28.334]
[-84.352035, 33.989889]
[-86.819468, 35.945115]
[-91.009638, 30.407864]
[-81.8428, 27.9096]
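A final caveat: indexing the scripts by position ([19]) breaks as soon as the page layout changes. A more defensive sketch locates the script by the variable it assigns, assuming the page still uses window.__SLS_REDUX_STATE__:

import json
import requests
from bs4 import BeautifulSoup

req = requests.get('https://locations.wafflehouse.com/')
soup = BeautifulSoup(req.content, 'html.parser')
# pick the script whose text contains the state variable, wherever it sits
script = next(s for s in soup.find_all('script')
              if s.string and 'window.__SLS_REDUX_STATE__' in s.string)
payload = script.string.replace('window.__SLS_REDUX_STATE__ =', '').strip().rstrip(';')
data = json.loads(payload)
for feature in data['dataLocations']['collection']['features']:
    print(feature['geometry']['coordinates'])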