python regex to split string at <a> elements and extract link + text

python regex to split string at <a> elements and extract link + text - python

Let's say I have several <a> elements in string:
# Example input: plain text interleaved with two <a> elements.
s = 'Hello world. <a href="https://stackoverflow.com/">StackOverflow</a> is a great website. <a href="https://www.espn.com/">ESPN</a> is another great website.'
The goal is to split the string so I get a list similar to the one below:
# Desired result: text fragments and link records, in document order.
l = [
    "Hello world. ",
    {"link": "https://stackoverflow.com/", "title": "StackOverflow"},
    " is a great website. ",
    {"link": "https://www.espn.com/", "title": "ESPN"},
    " is another great website.",
]
The dictionaries can be any object I can extract the link and title from. Is there a regex I can use to accomplish this? Or is there a better way to do this?

BeautifulSoup is a better tool than regex for parsing this string. As a general rule, don't use regex to parse HTML:
from pprint import pprint

from bs4 import BeautifulSoup, NavigableString, Tag

s = 'Hello world. <a href="https://stackoverflow.com/">StackOverflow</a> is a great website. <a href="https://www.espn.com/">ESPN</a> is another great website.'

soup = BeautifulSoup(s, 'html.parser')

# Walk the top-level nodes in document order: plain text is kept as-is,
# <a href=...> elements become {"link", "title"} dictionaries, and any
# other node is skipped.
out = []
for c in soup.contents:
    if isinstance(c, NavigableString):
        out.append(c)
    elif isinstance(c, Tag) and c.name == 'a' and 'href' in c.attrs:
        out.append({"link": c['href'], "title": c.text})

pprint(out)
Prints:
['Hello world. ',
{'link': 'https://stackoverflow.com/', 'title': 'StackOverflow'},
' is a great website. ',
{'link': 'https://www.espn.com/', 'title': 'ESPN'},
' is another great website.']

If you insist on using regex for this:
import re

s = 'Hello world. <a href="https://stackoverflow.com/">StackOverflow</a> is a great website. <a href="https://www.espn.com/">ESPN</a> is another great website.'

# A single pattern with two capture groups keeps every href paired with the
# text of the same <a> element, so no zip() of two independent searches is
# needed. Group 1 is the href value, group 2 is the link text.
sites = [
    {"link": link, "title": title}
    for link, title in re.findall(r'<a\s+href="(.*?)"[^>]*>(.*?)</a>', s)
]
print(sites)
Output:
[{'link': 'https://stackoverflow.com/', 'title': 'StackOverflow'}, {'link': 'https://www.espn.com/', 'title': 'ESPN'}]

Related

How to fix this error during scraping using BeautifulSoup?

I am trying to do web scraping using the BeautifulSoup and requests Python libraries. I want to filter the news titles from the Hacker News website, but my implementation raises an error.
import requests
from bs4 import BeautifulSoup

res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.select('.titleline a')
subtext = soup.select('.subtext')


def create_custom_hn(links, subtext):
    """Pair each selected link with the '.subtext' element at the same index."""
    hn = []
    for index, item in enumerate(links):
        title = links[index].getText()
        href = links[index].get('href', None)
        # This is the statement that raises the IndexError reported below:
        # `subtext` is indexed with a position taken from `links`.
        votes = subtext[index].select('.score')
        if len(votes):
            points = int(votes[0].getText().replace(' points', ''))
            print(points)
        # NOTE(review): the original indentation was lost; the nesting of
        # this append relative to the `if` above is assumed - confirm.
        hn.append({'title': title, 'href': href})
    return hn


print(create_custom_hn(links, subtext))
The error says
votes = subtext[index].select('.score')
~~~~~~~^^^^^^^
IndexError: list index out of range

Here is fixed version of the code from the question:
import requests
from bs4 import BeautifulSoup

res = requests.get("https://news.ycombinator.com/news")
soup = BeautifulSoup(res.text, "html.parser")
# The child combinator selects only the title link itself, not links
# nested deeper inside the title line.
links = soup.select(".titleline > a")


def create_custom_hn(links):
    """Build one ``{"title", "href", "points"}`` dict per title link.

    ``points`` is ``None`` when no score is found for the entry, instead
    of raising ``AttributeError`` on the missing element.
    """
    hn = []
    for link in links:
        title = link.getText()
        href = link.get("href", None)
        # Look for the score inside the subtext element that follows this
        # link, so an entry without a score cannot pick up the score of a
        # later entry.
        subtext = link.find_next(class_="subtext")
        votes = subtext.find(class_="score") if subtext is not None else None
        # The first whitespace-separated token is the number, which handles
        # both "1 point" and "97 points".
        points = int(votes.getText().split()[0]) if votes is not None else None
        hn.append({"title": title, "href": href, "points": points})
    return hn


print(create_custom_hn(links))
Prints:
[
{
"title": "Urllib3 in 2022",
"href": "https://sethmlarson.dev/urllib3-in-2022",
"points": 97,
},
{
"title": "First public release of Pushup: a new compiler for making web apps in Go",
"href": "https://github.com/adhocteam/pushup",
"points": 18,
},
{
"title": "Intelligence – A good collection of great OSINT Resources",
"href": "https://github.com/ARPSyndicate/awesome-intelligence",
"points": 113,
},
{
"title": "Microsoft is preparing to add ChatGPT to Bing",
"href": "https://www.bloomberg.com/news/articles/2023-01-04/microsoft-hopes-openai-s-chatbot-will-make-bing-smarter",
"points": 760,
},
...and so on.

Try to select your elements more specifically: your selection soup.select('.titleline a') matches more elements (60) than you actually want (30):
[<a href="https://sethmlarson.dev/urllib3-in-2022">Urllib3 in 2022</a>,
<span class="sitestr">sethmlarson.dev</span>,...]
I would also recommend iterating over the elements in a different way, so that you are able to handle missing values.
Example
import requests
from bs4 import BeautifulSoup

res = requests.get('https://news.ycombinator.com/news')
# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(res.text, 'html.parser')

data = []
# Iterate per story row so that title, url and votes always come from the
# same entry.
for e in soup.select('tr.athing'):
    title_link = e.select_one('.titleline a')
    data.append({
        'title': title_link.get_text(),
        'url': title_link.get('href'),
        # First whitespace-separated token of the subtext that follows the row.
        'votes': e.find_next(class_='subtext').text.split()[0],
    })
print(data)
Output
[{'title': 'Urllib3 in 2022', 'url': 'https://sethmlarson.dev/urllib3-in-2022', 'votes': '93'}, {'title': 'First public release of Pushup: a new compiler for making web apps in Go', 'url': 'https://github.com/adhocteam/pushup', 'votes': '16'}, {'title': 'Intelligence – A good collection of great OSINT Resources', 'url': 'https://github.com/ARPSyndicate/awesome-intelligence', 'votes': '109'}, {'title': 'Microsoft is preparing to add ChatGPT to Bing', 'url': 'https://www.bloomberg.com/news/articles/2023-01-04/microsoft-hopes-openai-s-chatbot-will-make-bing-smarter', 'votes': '755'}, {'title': 'Juan Tamariz, the godfather of close-up card magic', 'url': 'https://www.nytimes.com/2023/01/02/magazine/juan-tamariz-magic.html', 'votes': '31'}, {'title': 'The Expanding Dark Forest and Generative AI', 'url': 'https://maggieappleton.com/ai-dark-forest', 'votes': '223'}, {'title': 'Irreconcilable differences between local and distributed computing (1994)', 'url': 'https://scholar.harvard.edu/waldo/publications/note-distributed-computing', 'votes': '29'},...]

Iterate and extract info over div class

import requests
from bs4 import BeautifulSoup

url = "https://boulder.noshdelivery.co/restaurants"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
restaurant_wrapper = soup.find(class_ = "dd_rest_list")
restaurants = restaurant_wrapper.find_all(class_="menu__vendor-name")
# find() returns only the first match inside the wrapper.
restaurant_street_address = restaurant_wrapper.find("span", itemprop="streetAddress")
restaurant_address_locality = restaurant_wrapper.find("span", itemprop="addressLocality")


def extract_restaurant_data(restaurant):
    # The `restaurant` argument is not used: the lookup below searches the
    # whole wrapper again, and the two address values are the module-level
    # first matches, so every call returns the same dictionary.
    restaurant_title = restaurant_wrapper.find(class_="menu__vendor-name")
    return {
        "title" : restaurant_title.text.strip(),
        "streetAddress": restaurant_street_address.text.strip(),
        "addressLocality": restaurant_address_locality.text.strip()
    }


results = [extract_restaurant_data(restaurant) for restaurant in restaurants]
print(results)
I would like to know why this code, prints exactly the same info and does not iterate over the list of restaurants.
My output is this
{'title': '5280 Cafe At Rallysport', 'streetAddress': '2727 29th St.',
'addressLocality': 'Boulder'},
{'title': '5280 Cafe At Rallysport', 'streetAddress': '2727 29th St.', ' addressLocality': 'Boulder'}........
The info is the same. I do not know why my code does not iterate over the different names from the list of "restaurants"

You only did a single find for each piece of data. Do a find_all on each section and then zip the results together!
# Collect every name and address element, then walk the three lists in
# lockstep so that each restaurant gets its own dictionary.
names = restaurant_wrapper.find_all(class_="menu__vendor-name")
streets = restaurant_wrapper.find_all("span", itemprop="streetAddress")
localities = restaurant_wrapper.find_all("span", itemprop="addressLocality")
restaurant_details = zip(names, streets, localities)

results = []
for title, street_address, address_locality in restaurant_details:
    results.append(
        {
            "title": title.text.strip(),
            "streetAddress": street_address.text.strip(),
            "addressLocality": address_locality.text.strip(),
        }
    )
print(results)

Your function has restaurant_wrapper.find(class_="menu__vendor-name") written in it, so each time it runs it returns only the first occurrence of the class menu__vendor-name.
To print a new restaurant's detail in each iteration you would have to access each web element individually.
The code below would allow you to get the details for all restaurants.
restwords = restaurant_wrapper.find_all("div", {"class": "dd_restwords"})


def extract_restaurant_data(restaurant):
    """Return title and address fields read from one restaurant element.

    Every lookup is done on ``restaurant`` itself, so each call reports
    the data of the element it was given.
    """
    title = restaurant.find("div", {"class": "menu__vendor-name"}).text.strip()
    streetAddress = restaurant.find("span", {"itemprop": "streetAddress"}).text.strip()
    addressLocality = restaurant.find("span", {"itemprop": "addressLocality"}).text.strip()
    rest_data = {
        "title": title,
        "streetAddress": streetAddress,
        "addressLocality": addressLocality
    }
    return rest_data


for restaurant in restwords:
    print(extract_restaurant_data(restaurant))

How can I only parse the first HTML block from multiple blocks, if they all contain the same class-name?

I need to parse info from a site, on this site, there are 2 blocks, "Today" and "Yesterday", and they have the same class name of standard-box standard-list.
How can I parse only the first block (under "Today"), without extracting the information from "Yesterday", if they both have the same class name?
Here is my code:
import requests
from bs4 import BeautifulSoup

url_news = "https://www.123.org/"
response = requests.get(url_news)
soup = BeautifulSoup(response.content, "html.parser")
# findAll returns every matching block, i.e. both "Today" and "Yesterday".
items = soup.findAll("div", class_="standard-box standard-list")
news_info = []
for item in items:
    news_info.append({
        "title": item.find("div", class_="newstext",).text,
        "link": item.find("a", class_="newsline article").get("href")
    })

When running your provided code, I don't get an output for items. However, you said that you do, so:
If you only want to get the data under "Today", you can use .find() instead of .find_all(), since .find() will only return the first found tag -- which is "Today" and not the other tags.
So, instead of:
items = soup.findAll("div", class_="standard-box standard-list")
Use:
items = soup.find("div", class_="standard-box standard-list")
Additionally, to find the link, I needed to access the attribute using tag-name[attribute]. Here is working code:
news_info = []
# find() returns only the first matching block, or None when nothing matches.
items = soup.find("div", class_="standard-box standard-list")
# Ask for the article links explicitly: iterating over the block itself
# would also yield any text nodes between the links, which are not tags
# and have no "href".
articles = items.find_all("a", class_="newsline article") if items is not None else []
for item in articles:
    news_info.append(
        {"title": item.find("div", class_="newstext").text, "link": item["href"]}
    )
print(news_info)
Output:
[{'title': 'NIP crack top 3 ranking for the first time in 5 years', 'link': 'https://www.hltv.org/news/32545/nip-crack-top-3-ranking-for-the-first-time-in-5-years'}, {'title': 'Fessor joins Astralis Talent', 'link': 'https://www.hltv.org/news/32544/fessor-joins-astralis-talent'}, {'title': 'Grashog joins AGO', 'link': 'https://www.hltv.org/news/32542/grashog-joins-ago'}, {'title': 'ISSAA parts ways with Eternal Fire', 'link': 'https://www.hltv.org/news/32543/issaa-parts-ways-with-eternal-fire'}, {'title': 'BLAST Premier Fall Showdown Fantasy live', 'link': 'https://www.hltv.org/news/32541/blast-premier-fall-showdown-fantasy-live'}, {'title': 'FURIA win IEM Fall NA, EG claim final Major Legends spot', 'link': 'https://www.hltv.org/news/32540/furia-win-iem-fall-na-eg-claim-final-major-legends-spot'}]

Finding all URLs in a single line

I'm trying to fetch a page that has many URLs and other content, all on a single line of plain text, like
"link_url":"http://www.example.com/link1?site=web","mobile_link_url":"http://m.example.com/episode/link1?site=web" link_url":"http://www.example.com/link2?site=web","mobile_link_url":"http://m.example.com/episode/link2?site=web"
I tried
import re
import requests as req
response = req.get("http://api.example.com/?callback=jQuery112")
content = response.text
print content will give me the "link_url": output
but i need to find
http://www.example.com/link1?site=web
http://www.example.com/link2?site=web
and output only link1 and link2 to a file like
link1
link2
link3

The code below might be what you need.
import re

urls = '''"link_url":"http://www.example.com/link1?site=web","mobile_link_url":"http://m.example.com/episode/link1?site=web" link_url":"http://www.example.com/link2?site=web","mobile_link_url":"http://m.example.com/episode/link2?site=web"'''


def extract_link_names(text):
    """Return the ``linkN`` path segment of every ``http://www.`` URL in *text*.

    ``\\d+`` accepts multi-digit numbers (``link12``), and the ``www.``
    prefix excludes the ``m.`` mobile URLs.
    """
    return re.findall(r'http://www\.[^"\s]*/(link\d+)', text)


links = extract_link_names(urls)
print(links)

If it is a string and not a JSON object, then you could do this even though it's a bit hacky:
s1 = '"link_url":"http://www.example.com/link1?site=web","mobile_link_url":"http://m.example.com/episode/link1?site=web" link_url":"http://www.example.com/link2?site=web","mobile_link_url":"http://m.example.com/episode/link2?site=web"'

# Turn every key/value separator (":") into a marker, drop the remaining
# quotes, and treat both spaces and commas as pair separators. Each list
# item then has the form "<key>LINK_DELIM<url>".
links = s1.replace('":"', "LINK_DELIM").replace('"', "").replace(" ", ",").split(",")
for link in links:
    print(link.split("LINK_DELIM")[1])
Which yields:
http://www.example.com/link1?site=web
http://m.example.com/episode/link1?site=web
http://www.example.com/link2?site=web
http://m.example.com/episode/link2?site=web
Though I think @al76's answer is more elegant for this.
But if it's a JSON which looks like:
[
{
"link_url": "http://www.example.com/link1?site=web",
"mobile_link_url": "http://m.example.com/episode/link1?site=web"
},
{
"link_url": "http://www.example.com/link2?site=web",
"mobile_link_url": "http://m.example.com/episode/link2?site=web"
}
]
Then you could do something like:
import json

# Valid JSON: a list of objects, each mapping a key name to a URL. Keys and
# values carry no padding, so the extracted links are usable as-is.
s1 = '[{"link_url": "http://www.example.com/link1?site=web", "mobile_link_url": "http://m.example.com/episode/link1?site=web"}, {"link_url": "http://www.example.com/link2?site=web", "mobile_link_url": "http://m.example.com/episode/link2?site=web"}]'
data = json.loads(s1)
# Flatten the values of every object into a single list of URLs.
links = [y for x in data for y in x.values()]
for link in links:
    print(link)

If this is a JSON api then you can use response.json() to get a python dictionary, as .text will give you the response as one long string.
You also do not need to use regex for something so simple, python comes with a url parser out of the box.
So provided your response is something like
[
{
"link_url": "http://www.example.com/link1?site=web",
"mobile_link_url": "http://m.example.com/episode/link1?site=web"
},
{
"link_url": "http://www.example.com/link2?site=web",
"mobile_link_url": "http://m.example.com/episode/link2?site=web"
}
]
(doesn't matter if IRL it's one line, as long as it's valid JSON)
You can iterate the results as a dictionary, then use urlparse to get specific components of your urls:
from urllib.parse import urlparse

import requests

response = requests.get("http://api.example.com/?callback=jQuery112")
# Each element of the decoded JSON list is one dictionary of URLs.
for url in response.json():
    # Keep only the path of the URL, then take its last "/"-separated segment.
    print(urlparse(url["link_url"]).path.rsplit('/', 1)[-1])
urlparse(...).path returns only the path of the URL, e.g. /episode/link1, and we then take the last segment of that with rsplit to get just link1, link2, etc.

try
import re

urls=""" "link_url":"http://www.example.com/link1?site=web","mobile_link_url":"http://m.example.com/episode/link1?site=web" link_url":"http://www.example.com/link2?site=web","mobile_link_url":"http://m.example.com/episode/link2?site=web" """
# The capture group returns each URL without its surrounding quotes; the
# "www" prefix skips the "m." mobile URLs.
links = re.findall(r'"(http://www[^"]+)"', urls)
print(links)

urls=""" "link_url":"http://www.example.com/link1?site=web","mobile_link_url":"http://m.example.com/episode/link1?site=web" link_url":"http://www.example.com/link2?site=web","mobile_link_url":"http://m.example.com/episode/link2?site=web" """
# Treat spaces as commas, split into key/value pairs, and drop the empty
# first and last items produced by the padding spaces.
fields = urls.replace(' ', ",").split(",")[1:-1]
# Keep the part after the key of every pair (the URL, still quoted).
p = []
for field in fields:
    p.append(field.split('":')[1])
#### Output ####
['"http://www.example.com/link1?site=web"',
'"http://m.example.com/episode/link1?site=web"',
'"http://www.example.com/link2?site=web"',
'"http://m.example.com/episode/link2?site=web"']
*Not as efficient as regex.

BeautifulSoup fails to parse nested <p> elements

Dependencies:
BeautifulSoup==3.2.1
In: from BeautifulSoup import BeautifulSoup
In: BeautifulSoup('<p><p>123</p></p>')
Out: <p></p><p>123</p>
Why are the two adjacent tags not in the output?

That is just BS3's parser fixing your broken html.
The P element represents a paragraph. It cannot contain block-level
elements (including P itself).

This
<p><p>123</p></p>
is not valid HTML. <p> elements can't be nested, so BS tries to clean it up.
When BS encounters the second <p> it thinks the first p is finished, so it inserts a closing </p>. The second </p> in your input then does not match an opening <p> so it is removed.

This is because BeautifulSoup has this NESTABLE_TAGS concept/setting:
When Beautiful Soup is parsing a document, it keeps a stack of open
tags. Whenever it sees a new start tag, it tosses that tag on top of
the stack. But before it does, it might close some of the open tags
and remove them from the stack. Which tags it closes depends on the
qualities of tag it just found, and the qualities of the tags in the
stack.
So when Beautiful Soup encounters a <P> tag, it closes and pops all
the tags up to and including the previously encountered tag of the
same type. This is the default behavior, and this is how
BeautifulStoneSoup treats every tag. It's what you get when a tag is
not mentioned in either NESTABLE_TAGS or RESET_NESTING_TAGS. It's also
what you get when a tag shows up in RESET_NESTING_TAGS but has no
entry in NESTABLE_TAGS, the way the <P> tag does.
>>> pprint(BeautifulSoup.NESTABLE_TAGS)
{'bdo': [],
'blockquote': [],
'center': [],
'dd': ['dl'],
'del': [],
'div': [],
'dl': [],
'dt': ['dl'],
'fieldset': [],
'font': [],
'ins': [],
'li': ['ul', 'ol'],
'object': [],
'ol': [],
'q': [],
'span': [],
'sub': [],
'sup': [],
'table': [],
'tbody': ['table'],
'td': ['tr'],
'tfoot': ['table'],
'th': ['tr'],
'thead': ['table'],
'tr': ['table', 'tbody', 'tfoot', 'thead'],
'ul': []}
As a workaround, you can allow p tag to be inside p:
>>> from BeautifulSoup import BeautifulSoup
>>> BeautifulSoup.NESTABLE_TAGS['p'] = ['p']
>>> BeautifulSoup('<p><p>123</p></p>')
<p><p>123</p></p>
Also, BeautifulSoup 3rd version is no longer maintained - you should switch to BeautifulSoup4.
When using BeautifulSoup4, you can change the underlying parser to change the behavior:
>>> from bs4 import BeautifulSoup
>>> BeautifulSoup('<p><p>123</p></p>')
<html><body><p></p><p>123</p></body></html>
>>> BeautifulSoup('<p><p>123</p></p>', 'html.parser')
<p><p>123</p></p>
>>> BeautifulSoup('<p><p>123</p></p>', 'xml')
<?xml version="1.0" encoding="utf-8"?>
<p><p>123</p></p>
>>> BeautifulSoup('<p><p>123</p></p>', 'html5lib')
<html><head></head><body><p></p><p>123</p><p></p></body></html>

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

python regex to split string at <a> elements and extract link + text - python

Related

How to fix this error during scraping using BeautifulSoup?

Iterate and extract info over div class

How can I only parse the first HTML block from multiple blocks, if they all contain the same class-name?

Finding all URLs in a single line

BeautifulSoup fails to parse nested <p> elements

Categories

Resources