How to get data from a <script> tag with no attributes inside <body>? - python

I am trying to extract the coordinates of all of the restaurant's locations using BeautifulSoup. How can I extract them from the script tag under the body?
from bs4 import BeautifulSoup as bs
import requests
import json
base_url = 'https://locations.wafflehouse.com/'
r = requests.get(base_url)
soup = bs(r.text, 'html.parser')
all_scripts = soup.find_all('script')
print(all_scripts[19])

UPDATED ANSWER:
You need to strip the JavaScript assignment off the front, parse the JSON with json.loads(), and then navigate the resulting dict. Try this code:
import json, requests
from bs4 import BeautifulSoup
req = requests.get('https://locations.wafflehouse.com/')
soup = BeautifulSoup(req.content, 'html.parser')
# the 20th <script> tag holds the JSON assigned to window.__SLS_REDUX_STATE__
data = soup.find_all('script')[19].string
jdata = data.replace('window.__SLS_REDUX_STATE__ =', '').strip().rstrip(';')
data = json.loads(jdata)
for feature in data['dataLocations']['collection']['features']:
    latlong = feature['geometry']['coordinates']
    print(latlong)
Output:
[-90.073113, 30.37019]
[-84.131085, 33.952944]
[-78.719497, 36.14261]
[-95.629084, 29.947421]
[-83.9019, 33.56531]
[-80.091552, 37.288422]
[-77.949231, 34.237534]
[-96.60637, 32.968131]
[-80.969088, 29.151235]
[-86.843386, 33.354666]
[-84.206, 33.462175]
[-76.342464, 36.830187]
[-79.985822, 32.898412]
[-84.2784568595722, 33.
[-88.780694, 35.674914]
[-87.898899, 30.598605]
[-83.71487, 32.614092]
[-79.523611, 36.07101]
[-91.127792, 30.580582]
[-86.352681, 35.875097]
[-90.271372, 30.023002]
[-80.205641, 25.955672]
[-81.632, 30.157]
[-86.961821, 31.454352]
[-80.666906, 35.366769]
[-97.56596, 35.406447]
[-84.364334, 35.511474]
[-81.01622, 29.23453]
[-86.57177, 34.855504]
[-84.625908, 33.399829]
[-76.344303, 36.740862]
[-84.192634, 33.517948]
[-77.83421, 39.296024]
[-77.518985, 38.359332]
[-84.45238, 38.042061]
[-83.08319, 39.840191]
[-81.993971, 33.475816]
[-95.481102, 29.913294]
[-82.699, 28.334]
[-84.352035, 33.989889]
[-86.819468, 35.945115]
[-91.009638, 30.407864]
[-81.8428, 27.9096]
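Note that hard-coding index 19 is brittle: if the page adds or reorders its <script> tags, the lookup silently grabs the wrong one. Here is a minimal, more defensive sketch, assuming the page still assigns its state to window.__SLS_REDUX_STATE__:
import json
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('https://locations.wafflehouse.com/').content, 'html.parser')

# Locate the script by its content rather than by position.
marker = 'window.__SLS_REDUX_STATE__'
script = next(s for s in soup.find_all('script') if s.string and marker in s.string)

# Everything after the first '=' is the JSON object, minus the trailing ';'.
state = json.loads(script.string.split('=', 1)[1].strip().rstrip(';'))
for feature in state['dataLocations']['collection']['features']:
    print(feature['geometry']['coordinates'])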

Get data from bs4 script

I am new to programming and I am trying to parse this page: https://ruz.spbstu.ru/faculty/100/groups
import requests
from bs4 import BeautifulSoup
url = "https://ruz.spbstu.ru/faculty/100/groups"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
scripts = soup.find_all('script')
print(scripts[3].text)
this gives me
window.__INITIAL_STATE__ = {"faculties":{"isFetching":false,"data":null,"errors":null},"groups":{"isFetching":false,"data":{"100":[{"id":35754,"name":"3733806/00301","level":3,"type":"common","kind":0,"spec":"38.03.06 Торговое дело","year":2022},{"id":35715,"name":"3763801/10103","level":2,"type":"common","kind":3,"spec":"38.06.01 Экономика","year":2022},{"id":34725,"name":"з3753801/80430_2021","level":5,"type":"distance","kind":2,"spec":"38.05.01 Экономическая безопасность","year":2022},{"id":33632,"name":"3733801/10002_2021","level":2,"type":"common","kind":0,"spec":"38.03.01 Экономика","year":2022}...........
contents are very long so this is an extract from the output.
I need to get all the 'id' and 'name' values from this output and put them into a dictionary like {id: name}, but I can't figure out how to do it.
Any information will be very helpful.
Try:
import re
import json
import requests
from bs4 import BeautifulSoup
url = "https://ruz.spbstu.ru/faculty/100/groups"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
scripts = soup.find_all("script")
data = re.search(r".*?({.*});", scripts[3].text).group(1)  # capture the {...} object before the trailing ;
data = json.loads(data)
out = {d["id"]: d["name"] for d in data["groups"]["data"]["100"]}
print(out)
Prints:
{35754: '3733806/00301', 35715: '3763801/10103', ...etc.
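If the regex feels fragile, a split-based approach does the same job here. This is a sketch of my own, reusing scripts from the snippet above and assuming the tag body is a single window.__INITIAL_STATE__ = {...}; assignment:
import json

# Everything after the first '=' is the JSON object, minus the trailing ';'.
raw = scripts[3].text
state = json.loads(raw.split("=", 1)[1].strip().rstrip(";"))
out = {d["id"]: d["name"] for d in state["groups"]["data"]["100"]}
print(out)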

Can't scrape all of the ul tags from a table

I'm trying to scrape all of the proxy IPs from this site: https://proxy-list.org/english/index.php but I can only get one IP at most.
Here is my code:
from helium import *
from bs4 import BeautifulSoup
url = 'https://proxy-list.org/english/index.php'
browser = start_chrome(url, headless=True)
soup = BeautifulSoup(browser.page_source, 'html.parser')
proxies = soup.find_all('div', {'class':'table'})
for ips in proxies:
    print(ips.find('li', {'class':'proxy'}).text)
I tried to use ips.find_all but it didn't work.
from bs4 import BeautifulSoup
import requests
url = 'https://proxy-list.org/english/index.php'
pagecontent = requests.get(url)
soup = BeautifulSoup(pagecontent.text, 'html.parser')
maintable = soup.find_all('div', {'class':'table'})
for div_element in maintable:
    rows = div_element.find_all('li', class_='proxy')
    for ip in rows:
        print(ip.text)
If I get your question right, the following is one way you can fetch those proxies using the requests module and the BeautifulSoup library:
import re
import base64
import requests
from bs4 import BeautifulSoup
url = 'https://proxy-list.org/english/index.php'
def decode_proxy(target_str):
    converted_proxy = base64.b64decode(target_str)
    return converted_proxy.decode()

res = requests.get(url)
soup = BeautifulSoup(res.text, 'html.parser')
for tr in soup.select("#proxy-table li.proxy > script"):
    proxy_id = re.findall(r"Proxy[^']+(.*)\'", tr.contents[0])[0]
    print(decode_proxy(proxy_id))
First few results:
62.80.180.111:8080
68.183.221.156:38159
189.201.134.13:8080
178.60.201.44:8080
128.199.79.15:8080
139.59.78.193:8080
103.148.216.5:80
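The site hides each address as a base64 string inside an inline Proxy('...') call, which is why scraping the li text directly returns nothing useful. A minimal round-trip sketch, using a made-up address for illustration, showing what decode_proxy undoes:
import base64

# Hypothetical payload; the site embeds strings like `encoded` below
# inside Proxy('...') calls in each row's <script> tag.
encoded = base64.b64encode(b'127.0.0.1:8080').decode()
print(encoded)                             # the obfuscated form
print(base64.b64decode(encoded).decode())  # -> 127.0.0.1:8080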

Get the descriptions of product details in the script by BeautifulSoup and json

I'm trying to get the product detail descriptions from a <script> tag on the page.
Here's the code:
import re
import json
from bs4 import BeautifulSoup
import requests
url = 'https://oldnavy.gap.com/browse/product.do?pid=599211032&rrec=true&mlink=5050,12413545,onproduct1_rr_3&clink=12413545#pdp-page-content'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
results = soup.find('script', id="pdpData").string
jsData = re.search(r'window.__PRODUCT_PAGE_STATE__\s+=\s+', results)
data = json.loads(jsData.group(0))
and the part xxxxx I want looks like this in the script:
window.__PRODUCT_PAGE_STATE__ = JSON.parse(xxxxx)
Searching for window.__PRODUCT_PAGE_STATE__ with re.search, I still cannot reach the xxxxx part.
Is there any other way to extract the info in the part xxxxx?
Try this:
import re
import json
from bs4 import BeautifulSoup
import requests
url = 'https://oldnavy.gap.com/browse/product.do?pid=599211032&rrec=true&mlink' \
      '=5050,12413545,onproduct1_rr_3&clink=12413545#pdp-page-content'

soup = BeautifulSoup(
    requests.get(url).content,
    "html.parser",
).find('script', id="pdpData")

the_xxx_part = json.loads(
    re.search(r"\.parse\((.+)\);", soup.string, re.S).group(1).strip(),
)
print(json.loads(the_xxx_part)["productData"]["name"])
Output:
Unisex Faux-Fur-Trim Hooded Frost-Free Puffer Jacket for Toddler
To print the entire JSON object, edit this:
print(json.loads(the_xxx_part)["productData"]["name"])
into this:
print(json.loads(the_xxx_part))
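The reason json.loads runs twice: JSON.parse(xxxxx) means xxxxx is a JSON string literal whose value is itself serialized JSON, so the first pass yields a str and the second yields the dict. A tiny self-contained sketch with a made-up payload:
import json

# Made-up payload mimicking the JSON.parse("...escaped JSON...") on the page.
captured = '"{\\"productData\\": {\\"name\\": \\"example\\"}}"'
inner = json.loads(captured)                     # first pass -> str
print(json.loads(inner)["productData"]["name"])  # second pass -> 'example'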

Beautiful Soup to extract Chemical Names

I am trying to extract the chemical names (all in capital letters) from the URL below.
https://www.legislation.gov.au/Details/F2020L01255
I am interested in the chemicals shown in Schedule 4.
import requests
import re
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = 'https://www.legislation.gov.au/Details/F2020L01255'
headers = {"Accept-Language": "EN-AU, en;q=0.5"}
results = requests.get(url, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")
chemicals = []
chems_div = soup.find_all('div', class_='WordSection7')
I am stuck from here. The chemical names are wrapped in a <p> tag with class='MsoNormal' and a <span> tag with lang='EN-AU'.
Try this:
import requests
from bs4 import BeautifulSoup
url = 'https://www.legislation.gov.au/Details/F2020L01255'
headers = {"Accept-Language": "EN-AU, en;q=0.5"}
results = requests.get(url, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")
chems_div = soup.find('div', class_='WordSection7')
all_spans = [
    t.getText(strip=True) for t in
    chems_div.find_all("span", {"lang": "EN-AU"})
]
print([w for w in all_spans if w.isupper() and w != "SCHEDULE 4"])
Output:
['ABACAVIR.', 'ABATACEPT.', 'ABIRATERONE ACETATE.', 'ABCIXIMAB.', 'ABEMACICLIB.', 'ACALABRUTINIB.', 'ACAMPROSATE CALCIUM.', 'ACARBOSE.', 'ACEBUTOLOL.', 'ACEPROMAZINE.', 'ACETARSOL.', 'ACETAZOLAMIDE.', 'ACETOHEXAMIDE.', 'ACETYL ISOVALERYLTYLOSIN.', 'ACETYLCARBROMAL.', 'ACETYLCHOLINE.', 'ACETYLDIGITOXIN.', 'ACETYLMETHYLDIMETHYLOXIMIDOPHENYLHYDRAZINE.', 'ACETYLSTROPHANTHIDIN.', 'ACIPIMOX.', '# ACITRETIN.', 'ACLIDINIUM BROMIDE.', 'ACOKANTHERA OUABAIO.', 'ACOKANTHERA SCHIMPERI.', 'ACRIVASTINE.', 'ADALIMUMAB.', 'ADAPALENE.', 'ADEFOVIR.', 'ADIPHENINE.', 'ADONIS VERNALIS.', 'ADRAFINIL.', 'AFAMELANOTIDE.', 'AFATINIB DIMALEATE.'
and so on...
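Since the question already imports pandas, a possible follow-up (my addition, not part of the original answer) is to tidy the filtered names into a one-column DataFrame, reusing all_spans from above:
import pandas as pd

chemicals = [w for w in all_spans if w.isupper() and w != "SCHEDULE 4"]
# Strip the leading '# ' markers and trailing periods before storing.
df = pd.DataFrame({"chemical": [c.lstrip('# ').rstrip('.') for c in chemicals]})
print(df.head())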

Scrape Multiple URLs using Beautiful Soup

I'm trying to extract specific classes from multiple URLs. The tags and classes stay the same, but I need my Python program to scrape all of the URLs as I input them.
Here's a sample of my work:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
url = input('insert URL here: ')
#scrape elements
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
#print titles only
h1 = soup.find("h1", class_= "class-headline")
print(h1.get_text())
This works for individual URLs but not for a batch. Thanks for helping me. I learned a lot from this community.
Have a list of urls and iterate through it.
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
urls = ['https://www.website1.com', 'https://www.website2.com', 'https://www.website3.com', .....]
#scrape elements
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    #print titles only
    h1 = soup.find("h1", class_= "class-headline")
    print(h1.get_text())
If you are going to prompt the user for input for each site, it can be done this way:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
#scrape elements
msg = 'Enter URL, to exit type q and hit enter: '
url = input(msg)
while url != 'q':
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    #print titles only
    h1 = soup.find("h1", class_= "class-headline")
    print(h1.get_text())
    url = input(msg)  # reassign here so entering 'q' actually exits the loop
If you want to scrape links in batches, specify a batch size and iterate over the chunks.
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
batch_size = 5
urllist = ["url1", "url2", "url3", .....]
url_chunks = [urllist[x:x+batch_size] for x in range(0, len(urllist), batch_size)]

def scrape_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    h1 = soup.find("h1", class_= "class-headline")
    return h1.get_text()

def scrape_batch(url_chunk):
    chunk_resp = []
    for url in url_chunk:
        chunk_resp.append(scrape_url(url))
    return chunk_resp

for url_chunk in url_chunks:
    print(scrape_batch(url_chunk))
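Batching pays off most when each chunk is fetched concurrently. A minimal sketch (my addition, not part of the original answer) that reuses scrape_url, url_chunks, and batch_size from above with a thread pool:
from concurrent.futures import ThreadPoolExecutor

# Fetch each chunk's URLs in parallel; network-bound work like this
# threads well even under the GIL.
with ThreadPoolExecutor(max_workers=batch_size) as pool:
    for url_chunk in url_chunks:
        for title in pool.map(scrape_url, url_chunk):
            print(title)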
