extracting url using beautifulsoup - python

With this code:
# Fetch the GitHub search page, save it locally, and print the first
# repository link. (URL fixed: the original was missing the '?' that
# starts the query string, and misspelled 'Repositories'.)
url = "https://github.com/search?o=desc&p=1&q=stars%3A%3E1&s=stars&type=Repositories"
with urllib.request.urlopen(url) as response:
    html = response.read()
html = html.decode('utf-8')
# Keep a copy of the raw page for offline inspection.
with open('page_content.html', 'w', encoding='utf-8') as new_file:
    new_file.write(html)
soup = BeautifulSoup(html, 'lxml')
g_data = soup.findAll("a", {"class": "v-align-middle"})
print(g_data[0])
The output is:
<a class="v-align-middle" data-hydro-click='{"event_type":"search_result.click","payload":{"page_number":1,"query":"stars:>1","result_position":1,"click_id":28457823,"result":{"id":28457823,"global_relay_id":"MDEwOlJlcG9zaXRvcnkyODQ1NzgyMw==","model_name":"Repository","url":"https://github.com/freeCodeCamp/freeCodeCamp"},"originating_request_id":"ECC6:1DF24:CE9C0F:1667572:5A8DDD6F"}}' data-hydro-hmac="42c4e038b86cefc302d5637e870e6d746ee7fa95eadf2b26930cb893c6a3bc53" href="/freeCodeCamp/freeCodeCamp">freeCodeCamp/freeCodeCamp</a>
How do I extract the URL below from the output:
https://github.com/freeCodeCamp/freeCodeCamp
Thanks!

Get the value of the attribute, json.loads() it and work with it as a regular python dict:
import json

# your other code, up to setting the g_data

# The attribute value is a JSON document, so decode it and then index
# into the resulting dict like any other Python mapping.
hydro_raw = g_data[0]['data-hydro-click']
hydro = json.loads(hydro_raw)
print(hydro['payload']['result']['url'])

It's inside a json string, that's why it's hard to get at
import json
from bs4 import BeautifulSoup

# Minimal reproduction: the target URL lives inside the JSON carried by
# the anchor's 'data-hydro-click' attribute. (The original sample lost
# the <a> element, so soup.a was None and the snippet crashed.)
html = """
<h3>
<a class="v-align-middle"
   data-hydro-click='{"payload": {"result": {"url": "https://github.com/freeCodeCamp/freeCodeCamp"}}}'
   href="/freeCodeCamp/freeCodeCamp">freeCodeCamp/freeCodeCamp</a>
</h3>
"""
soup = BeautifulSoup(html, 'lxml')
parsed_json = json.loads(soup.a.get('data-hydro-click'))
parsed_json['payload']['result']['url']
# returns 'https://github.com/freeCodeCamp/freeCodeCamp'

Related

Get data from bs4 script

I am new to programming and i am trying to parse this page: https://ruz.spbstu.ru/faculty/100/groups
url = "https://ruz.spbstu.ru/faculty/100/groups"
# Fetch the page and parse it in one pass with the stdlib parser.
soup = BeautifulSoup(requests.get(url).text, "html.parser")
scripts = soup.find_all('script')
# The fourth <script> holds the inlined application state.
print(scripts[3].text)
this gives me
window.__INITIAL_STATE__ = {"faculties":{"isFetching":false,"data":null,"errors":null},"groups":{"isFetching":false,"data":{"100":[{"id":35754,"name":"3733806/00301","level":3,"type":"common","kind":0,"spec":"38.03.06 Торговое дело","year":2022},{"id":35715,"name":"3763801/10103","level":2,"type":"common","kind":3,"spec":"38.06.01 Экономика","year":2022},{"id":34725,"name":"з3753801/80430_2021","level":5,"type":"distance","kind":2,"spec":"38.05.01 Экономическая безопасность","year":2022},{"id":33632,"name":"3733801/10002_2021","level":2,"type":"common","kind":0,"spec":"38.03.01 Экономика","year":2022}...........
contents are very long so this is an extract from the output.
I need to get all 'id's and 'name's from this output and put them into a dictionary like {id: name}, but I can't figure out how to do it.
Any information will be very helpful.
Try:
import re
import json
import requests
from bs4 import BeautifulSoup

url = "https://ruz.spbstu.ru/faculty/100/groups"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
scripts = soup.find_all("script")
# Anchor the match on the assignment and let '.' cross newlines
# (re.DOTALL) so extraction still works if the inlined JSON is
# pretty-printed instead of minified onto one line.
match = re.search(
    r"window\.__INITIAL_STATE__\s*=\s*({.*});", scripts[3].text, re.DOTALL
)
data = json.loads(match.group(1))
# Map every group's numeric id to its display name.
out = {d["id"]: d["name"] for d in data["groups"]["data"]["100"]}
print(out)
Prints:
{35754: '3733806/00301', 35715: '3763801/10103', ...etc.

Is there a way to extract only the string of a paragraph when webscraping in python?

We want to know if there is a way to extract only the string of a paragraph when web scraping in python?
The problem lies in our definition of 'price'
Anastacia <3
Code:
import requests
from bs4 import BeautifulSoup


def scraper(url):
    """Print every price element found on the page.

    NOTE(review): despite its name, 'url' is actually the
    requests.Response object passed in by the caller below.
    """
    url.status_code
    url.headers
    page = BeautifulSoup(url.content, "html.parser")
    for tag in page.find_all(class_="product-price font-weight-bold mb-0"):
        # Prints the whole <p> tag, not just its text content.
        print(tag)
        print()


url = "https://www.bevco.dk/spiritus/"
url = requests.get(url)
scraper(url)
You only have to modify the variable price. Extract the text from between the tags and for esthetic purposes, strip the extracted string.
import requests
from bs4 import BeautifulSoup


def scraper(url):
    """Print the stripped text of every product price on the page.

    url: a requests.Response for the page (the caller passes the
    response object in, despite the parameter name).
    """
    soup = BeautifulSoup(url.content, "html.parser")
    for p in soup.find_all(class_="product-price font-weight-bold mb-0"):
        # .get_text() drops the markup; .strip() trims the surrounding
        # whitespace for cleaner output.
        print(p.get_text().strip())
        print()


url = "https://www.bevco.dk/spiritus/"
url = requests.get(url)
scraper(url)
import requests
from bs4 import BeautifulSoup


def getdata(url):
    """Return the raw HTML body of *url* as text."""
    r = requests.get(url)
    return r.text


htmldata = getdata("https://www.bevco.dk/spiritus")
soup = BeautifulSoup(htmldata, 'html.parser')
# Print the text of every price paragraph. (Removed the stray
# `enter code here` placeholder, which was a syntax error, and the
# unused `data = ''` initialiser.)
for data in soup.find_all("p", {'class': 'product-price font-weight-bold mb-0'}):
    print(data.get_text())

why is my scraper showing 'none' as the output?

from bs4 import BeautifulSoup
import requests

url = 'https://www.flipkart.com'
data = {
    'q': 'laptops'
}
# NOTE(review): this POSTs the query to the homepage; the returned page
# may not contain the expected result divs at all.
html_code = requests.post(url, data=data).text
soup = BeautifulSoup(html_code, 'lxml')
for title in soup.findAll('div', class_='_3wU53n'):
    print(title, end='\n')
you can try this code
import requests
from bs4 import BeautifulSoup

to_search = 'laptops'
# Rebuilt as one string: the original literal was broken across lines
# without continuation, which is a syntax error. The search endpoint is
# a GET with the query in the URL.
url = ('https://www.flipkart.com/search?q={}&otracker=search&otracker1=search'
       '&marketplace=FLIPKART&as-show=on&as=off').format(to_search)
html_code = requests.get(url).text
soup = BeautifulSoup(html_code, 'lxml')
titles = soup.findAll('div', class_='_3wU53n')
for title in titles:
    print(title, end='\n')
Can you please check again? Firstly, the URL 'https://www.flipkart.com' actually serves search results via a 'GET' request, not 'POST'. Secondly, the class _3wU53n which you have provided is not present in the HTML DOM. Have a look at the screenshot below for the 'GET' request and the absence of class _3wU53n in the HTML DOM.

extract a html ID from inside a tag using beautiful soup python

I am trying to extract only an iid code in html so that i can append it to a url and open the page I need.
I can find the tag I need by specifying the class of the tag. However, I also get 4 other tags in the output. All I want is the iid inside the first tag: "183988596953".
I have tried using this code to specify only the idd
rslt_table = soup.find_all("iid",{"div class": "lvpic pic img left"})
This however only seems to return an empty list []
The output I get when replacing that line with the second-to-last line of the code below is the output with the 4 tags I mentioned.
from bs4 import BeautifulSoup
import requests
import re

# Search URL pattern: https://www.ebay.co.uk/sch/i.html?_from=R40&_trksid=m570.l1313&_nkw=<query>
urls = ['https://www.ebay.co.uk/sch/i.html?_from=R40&_trksid=m570.l1313&_nkw=goldfinger+quad']


def find_id(urls):
    """Return the first matching listing <div> from the first URL only.

    The `return` inside the loop means any URLs after the first are
    never visited.
    """
    for url in urls:
        resp = requests.session().get(url)
        listing = BeautifulSoup(resp.content, "html.parser").find(
            "div", {"class": "lvpic pic img left"}
        )
        return listing
My search url is https://www.ebay.co.uk/sch/i.html?_from=R40&_trksid=m570.l1313&_nkw=goldfinger+quad'
Full outpt is
<div class="lvpic pic img left" iid="183988596953">
<div class="lvpicinner full-width picW">
<a class="img imgWr2" href="https://www.ebay.co.uk/itm/GOLDFINGER-1964-Style-A-B-UK-Cinema-High-Quality-Repro-30-x-40-quad-poster/183988596953?hash=item2ad69330d9:g:rYQAAOSwrENdbmEW">
<img alt='GOLDFINGER 1964 Style A & B - UK Cinema High Quality Repro 30"x 40" quad poster' class="img" src="https://i.ebayimg.com/thumbs/images/g/rYQAAOSwrENdbmEW/s-l225.jpg"/>
</a>
</div></div>
Your code updated:
Use attrs to return all the attributes
{'class': ['lvpic', 'pic', 'img', 'left'], 'iid': '183988596953'}
def find_id(urls):
    """Return the 'iid' attribute of the first listing div.

    Only the first URL is processed because of the early return.
    """
    for url in urls:
        resp = requests.session().get(url)
        soup = BeautifulSoup(resp.content, "html.parser")
        first_div = soup.find("div", {"class": "lvpic pic img left"})
        # .attrs exposes every attribute of the tag, including 'iid'.
        return first_div.attrs['iid']


iid = find_id(urls)
print(iid)
>>> '183988596953'
If you want all iid:
def find_id(urls):
    """Return the 'iid' of every matching listing div on the first URL.

    Bug fixed: the original called `s.find_all(...)` but the soup was
    bound to `soup`, so the undefined name `s` raised NameError.
    """
    for url in urls:
        session = requests.session()
        response = session.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        divs = soup.find_all("div", attrs={'class': 'lvpic pic img left'})
        return [div.attrs['iid'] for div in divs]

Beautifulsoup parsing html line breaks

I'm using BeautifulSoup to parse some HTML from a text file. The text is written to a dictionary like so:
websites = ["1"]
html_dict = {}
for website_id in websites:
    path = "{}/final_output/raw_html.txt".format(website_id)
    with codecs.open(path, encoding="utf8") as fh:
        # splitlines() turns the document into a list of lines, so any
        # tag spanning multiple lines is split across list entries.
        lines = fh.read().splitlines()
    html_dict[website_id] = lines
I parse the HTML from html_dict = {} to find texts with the <p> tag:
scraped = {}
for website_id in html_dict:
    scraped[website_id] = []
    # Each entry is a single line, parsed as its own document.
    for fragment in html_dict[website_id]:
        soup = BeautifulSoup(fragment, 'html.parser')
        scrape_selected_tags = soup.find_all('p')
This is what the HTML in html_dict looks like:
<p>Hey, this should be scraped
but this part gets ignored for some reason.</p>
The problem is, BeautifulSoup seems to be considering the line break and ignoring the second line. So when I print out scrape_selected_tags the output is...
<p>Hey, this should be scraped</p>
when I would expect the whole text.
How can I avoid this? I've tried splitting the lines in html_dict and it doesn't seem to work. Thanks in advance.
By calling splitlines when you read your html documents you break the tags in a list of strings.
Instead you should read all the html in a string.
websites = ["1"]
html_dict = {}
for website_id in websites:
    path = "{}/final_output/raw_html.txt".format(website_id)
    with codecs.open(path, encoding="utf8") as fh:
        # Read the document as ONE string so tags stay intact.
        html_dict[website_id] = fh.read()
Then remove the inner for loop, so you won't iterate over that string.
scraped = {}
for website_id in html_dict:
    scraped[website_id] = []
    # Parse the full document once; no inner per-line loop needed.
    soup = BeautifulSoup(html_dict[website_id], 'html.parser')
    scrape_selected_tags = soup.find_all('p')
BeautifulSoup can handle newlines inside tags, let me give you an example:
# A newline inside a tag is preserved when the whole document is parsed.
html = '''<p>Hey, this should be scraped
but this part gets ignored for some reason.</p>'''
print(BeautifulSoup(html, 'html.parser').find_all('p'))
[<p>Hey, this should be scraped\nbut this part gets ignored for some reason.</p>]
But if you split one tag in multiple BeautifulSoup objects:
# Parsing line-by-line splits one tag across separate soups.
html = '''<p>Hey, this should be scraped
but this part gets ignored for some reason.</p>'''
for fragment in html.splitlines():
    print(BeautifulSoup(fragment, 'html.parser').find_all('p'))
[<p>Hey, this should be scraped</p>]
[]

Categories

Resources