Extracting links and title from several pages - python

I'm trying to build my own RSS feed with download links,
but the feed provides only the link to the whole season.
I'm taking that link to the whole season and want to extract the specific download link to the episode itself (uploaded/ul).
This is what I've got so far.
Are there any possibilities to get that working?
import feedparser, urllib2, re
from BeautifulSoup import BeautifulSoup
episodenliste = ['Game.of.Thrones','Arrow']
episode_link = []
episode_title = []
d = feedparser.parse('http://serienjunkies.org/xml/feeds/episoden.xml')
for post in d.entries:
if ('DEUTSCH' in post.title) and any (word in post.title for word in episodenliste) and ('720p' in post.title):
post.title = post.title.replace('[DEUTSCH] ','')
post.title = re.sub(r'(.*S\d+E\d+)(.*)',r'\1' ,post.title)
episode_link.append(post.link)
episode_title.append(post.title)
print post.title + ": " + post.link + "\n"
for search_title in episode_title:
for get_dlLink in episode_link:
page_ = urllib2.Request(get_dlLink)
page = urllib2.urlopen(page_).read()
soup = BeautifulSoup(page)
print search_title
title = soup.find('strong', text=search_title)
if title is not None:
print title
# link = title.parent
# links = link.find_all('a')
# print links
# for link2 in links:
# url = link2['href']
# print url
# pattern = 'http:\/\/download\.serienjunkies\.org.*%s_.*\.html' % ul
# if re.match(pattern, url):
# print url
As far as I can tell, it works up to the point where I'm searching the page for the title.
It gets to the pages parsed from the RSS feed, but it doesn't find the title.
My idea was:
first find the title and then extract the 'children'/links from it.
Any help is appreciated.
Thanks in advance.

Without JavaScript enabled the HTML looks quite different:
<p><strong>Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS</strong><br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-55bc328624d93658/fm_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | filemonkey.in<br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-25023a87144345f9/so_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | share-online.biz<br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-3e8ea978a2cf7bda/ul_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | uploaded.to</p>
As the title from the RSS feed without the [DEUTSCH] prefix is the first text in the paragraph on the page for the series, it can be the basis for searching and extracting the entry. Two elements up there is the <p> tag containing all the data for the episode. And that's links followed by the name of the file hoster.
import feedparser
import requests
from bs4 import BeautifulSoup
FEED_URL = 'http://serienjunkies.org/xml/feeds/episoden.xml'
def is_interesting_entry(entry, title_prefix, series_names):
    """Return True if the entry title has the prefix and mentions a series."""
    if not entry.title.startswith(title_prefix):
        return False
    return any(name in entry.title for name in series_names)
def process_entry(entry, title_prefix):
    """Strip *title_prefix* from the entry title and return (title, link).

    Raises ValueError when the title does not start with the prefix.
    """
    full_title = entry.title
    if not full_title.startswith(title_prefix):
        raise ValueError(
            'expected prefix {0!r} not found in {1!r}'.format(
                title_prefix, full_title
            )
        )
    return (full_title[len(title_prefix):], entry.link)
def process_feed(feed_url, title_prefix, series_names):
    """Yield (title, link) pairs for matching entries of the feed at *feed_url*."""
    entries = feedparser.parse(feed_url).entries
    return (
        process_entry(entry, title_prefix)
        for entry in entries
        if is_interesting_entry(entry, title_prefix, series_names)
    )
# Cache shared by all calls that do not supply their own mapping.
_SERIES_SOUP_CACHE = {}


def get_series_soup(url, cache=None):
    """Download *url* and return its parsed BeautifulSoup, memoized per URL.

    The original used a mutable default argument (``cache=dict()``) as the
    shared cache; an explicit module-level dict keeps the same memoization
    behavior without the mutable-default pitfall.  Pass ``cache`` to use a
    private mapping instead of the shared one.
    """
    if cache is None:
        cache = _SERIES_SOUP_CACHE
    if url not in cache:
        cache[url] = BeautifulSoup(requests.get(url).text)
    return cache[url]
def get_download_urls(soup, title):
    """Map hoster name -> download URL for the paragraph whose text is *title*.

    Returns an empty dict when the title text is not found on the page.
    """
    title_text = soup.find(text=title)
    if not title_text:
        return dict()
    # Two levels up is the element holding all anchors for the episode;
    # each anchor's following text names the file hoster.
    container = title_text.parent.parent
    result = dict()
    for a_tag in container('a'):
        hoster = a_tag.next_sibling.strip('| ')
        result[hoster] = a_tag['href']
    return result
def main():
series_names = ['Game.of.Thrones', 'Arrow']
for title, url in process_feed(FEED_URL, '[DEUTSCH] ', series_names):
print
print title
hoster2url = get_download_urls(get_series_soup(url), title)
if hoster2url:
for hoster, download_url in sorted(hoster2url.iteritems()):
print '{0:>20s}: {1}'.format(hoster, download_url)
else:
print ' --- No downloads ---'
if __name__ == '__main__':
main()

PASTEBIN
<item>
<title>[DEUTSCH] Arrow.S02E14.Gegen.die.Zeit.GERMAN.DUBBED.720p.HDTV.x264-ZZGtv</title>
<description>[DEUTSCH] Arrow.S02E14.Gegen.die.Zeit.GERMAN.DUBBED.720p.HDTV.x264-ZZGtv</description>
<pubDate>Fri, 18 Jul 2014 00:00:00 +0200</pubDate>
<link>http://serienjunkies.org/arrow/arrow-staffel-2-hdtvweb-dl-sd720p1080p/</link>
</item>
Sorry, I didn't know that.
<p><strong>Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS</strong><br><div id="download_mirrors" class="download_main"><strong>Download:</strong> uploaded.net <span style="font-size:10px">(best speed) </span><br><strong style="margin-left:14px">Mirrors:</strong> <img src="http://serienjunkies.org/media/img/stream/application_cascade.png" style="cursor:pointer;" title="Mirrors zeigen" onclick="toggle("Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS");"><div id="Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS" style="display: none;">
<strong style="margin-left:20px">Mirror:</strong> filemonkey.in<br>
<strong style="margin-left:20px">Mirror:</strong> share-online.biz<br>
</div><div><strong style="margin-left:18px">Usenet:</strong> Highspeed Mirror</div></div></p>

Related

can i get data in b tag under the a tag with selenium with python?

Can I get the data in a b tag under an a tag with Selenium in Python?
If I can, how? Could you tell me the solution?
this is structure of html
...
<div class = "cont_inner">
<div class = "wrap_tit_ mg_tit">
<a href = "href="https://cp.news.search.daum.net/p/97048679" class"f_link_b" onclick="smartLog(this, "dc=NNS&d=26DQnlvsWTMHk5CtBf&pg=6&r=2&p=4&rc=10&e1=163cv75CcAF31EvlGD&e3=0&ext=dsid=26DQnlvsWTMHk5CtBf", event, {"cpid": {"value": "163cv75CcAF31EvlGD"}});" target = "_blank"> == $0
"하남지역자활센터,"
<b>보건복지부</b>
"간이평가 우수기관"
</a>
</div>
and i wanna get data as like
"하남지역자활센터, 보건복지부 간이평가우수기관"
this is my code state
[['"하남지역자활센터, , 간이평가 우수기관"']]
and this is my source code for crawling data on the website
class crwaler_daum:
    """Crawl Daum news search result pages, collecting titles and bodies."""

    def __init__(self):
        # Collected titles and bodies; url/page are read interactively.
        self.title = []
        self.body = []
        self.url = input("please enter url for crawling data")
        self.page = input('please enter number of page to get data')

    def get_title(self):
        return self.title

    def set_title(self, title):
        self.title.append(title)

    def get_body(self):
        return self.body

    def set_body(self, body):
        self.body.append(body)

    def crwaling_title(self):
        """Walk the result pages and collect the title text of each item."""
        title_list = []
        chrome_driver = webdriver.Chrome('D:/바탕 화면/인턴/python/crwaler/news_crawling/chromedriver.exe')
        url = self.url
        response = requests.get(url, verify=False)
        root = lxml.html.fromstring(response.content)
        chrome_driver.get(url)
        for i in range(int(self.page) + 1):
            # BUG FIX: the XPath attribute axis is '@id', not '#id'
            # (the '#' was a markdown artifact in the original).
            for j in root.xpath('//*[@id="clusterResultUL"]/li'):
                title_list.append(j.xpath('div[2]/div/div[1]/a/text()'))
            print(title_list)
            chrome_driver.get('https://search.daum.net/search?w=news&DA=PGD&enc=utf8&cluster=y&cluster_page=3&q=%EB%B3%B4%EA%B1%B4%EB%B3%B5%EC%A7%80%EB%B6%80&p={}'.format(i))
lxml has a built-in function ".text_content()" which "Returns the text content of the element, including the text content of its children, with no markup." But after using this function, you should manipulate the string to get it into the form you want. I hope you will understand what I mean better with the code below; it may not be quite practical because I'm a beginner in Python too, but it solves the problem for now.
import lxml.html

# Reproduction of the target markup: the <a> mixes quoted text nodes with a
# nested <b> tag, which is presumably why a plain a/text() XPath loses the
# <b> content — TODO confirm against the asker's page.
html = '''
<div class = "cont_inner">
<div class = "wrap_tit_ mg_tit">
<a href = "href="https://cp.news.search.daum.net/p/97048679" class"f_link_b" onclick="smartLog(this, "dc=NNS&d=26DQnlvsWTMHk5CtBf&pg=6&r=2&p=4&rc=10&e1=163cv75CcAF31EvlGD&e3=0&ext=dsid=26DQnlvsWTMHk5CtBf", event, {"cpid": {"value": "163cv75CcAF31EvlGD"}});" target = "_blank">
"하남지역자활센터,"
<b>보건복지부</b>
"간이평가 우수기관"
</a>
</div>'''

my_html = lxml.html.fromstring(html)
# BUG FIX: the XPath attribute axis is '@class', not '#class'
# (the '#' was a markdown artifact in the original).
a_element = my_html.xpath('//div[@class="wrap_tit_ mg_tit"]/a')
print(a_element[0].text_content())
def prettify_string(string):
    """Drop newlines and double quotes, then collapse runs of spaces."""
    # Remove newlines and quote characters, then split on single spaces;
    # consecutive spaces yield empty fragments.
    parts = string.replace("\n", "").replace("\"", "").split(" ")
    # Filtering the empties in one pass replaces the original quadratic
    # ``while "" in parts: parts.remove("")`` loop.
    return " ".join(part for part in parts if part)
"""
Prints:
"하남지역자활센터,"
보건복지부
"간이평가 우수기관"
"""
print(prettify_string(str(a_element[0].text_content())))
"""
Prints:
하남지역자활센터, 보건복지부 간이평가 우수기관
"""
I haven't used lxml crawler yet, but you can use BeautifulSoup instead.
from bs4 import BeautifulSoup

# Let Selenium render the page (JavaScript included), then hand the
# resulting HTML to BeautifulSoup for parsing.
chrome_driver = webdriver.Chrome('your direction')
chrome_driver.get('your url')
rendered = chrome_driver.page_source
soup = BeautifulSoup(rendered, 'html.parser')
b_tag = soup.find_all('b')

Finding tag of text-searched element in HTML

I am trying to scrape multiple web pages to compare the prices of books. Because every site has a different layout (and class names), I want to find the title of the book using regex and then the surrounding elements. An example of the code is given below.
from bs4 import BeautifulSoup
import re

# NOTE: html_page1 deliberately keeps the asker's broken attribute
# (class='price with no closing quote) — it is part of the question.
html_page1 = """
<div class='product-box'>
<h2 class='title'>Title Book</h2>
<p class='price>18.45</p>
</div>
"""
html_page2 = """
<div class='page-box'>
<h2 class='orange-heading'>Title Book</h2>
<p class='blue-price'>18.45</p>
</div>
"""

# Heuristic for a book title: two consecutive capitalised words.
# Compiled once instead of once per page.
title_pattern = re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))")

# Parse the first page and pull out matching text nodes.
soup1 = BeautifulSoup(html_page1, 'html.parser')
names1 = soup1.find_all(string=title_pattern)
print('Names1: ', names1)

# Same for the second page with its different class names.
soup2 = BeautifulSoup(html_page2, 'html.parser')
names2 = soup2.find_all(string=title_pattern)
print('Names2: ', names2)
This returns:
Names1: ['Title Book']
Names2: ['Title Book']
Now I want to use this information to find the corresponding price. I know that when an element has been selected using the tags and class names, "next_sibling" can be used, however this doesn't work for the element selected by text:
select_title = soup1.find('h2', {"class": "title"})
# NOTE(review): `new_try` is undefined in this excerpt — presumably this
# should read `select_title.next_sibling`; confirm against the full script.
next_sib = new_try.next_sibling
print(next_sib) # returns <p class='price>18.45
# now try the same thing on element selected by name, this will result in an error
# (`names1` is a ResultSet, i.e. a list of matches, which has no
# `.next_sibling` attribute — that lives on individual elements/strings)
next_sib = names1.next_sibling
How can I use the same method to find the price when I have found the element using its text?
A similar question can be found here: Find data within HTML tags using Python. However, it still uses the HTML tags.
EDIT The problem is that I have many pages with different layouts and class names. Because of that I cannot use the tag/class/id name to find the elements and I have to find the book titles using regex.
To get the price Include 'h2' tag while doing it find_all() And then use find_next('p')
The first example of p tag where string was missing for classname I have added the string class='price'.
from bs4 import BeautifulSoup
import re

html_page1 = """
<div class='product-box'>
<h2 class='title'>Title Book</h2>
<p class='price'>18.45</p>
</div>
"""
html_page2 = """
<div class='page-box'>
<h2 class='orange-heading'>Title Book</h2>
<p class='blue-price'>18.45</p>
</div>
"""

# Title heuristic: two consecutive capitalised words (compiled once).
pattern = re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))")

# Both pages get identical treatment: find the matching <h2>, then the
# following <p> holds the price.
for label, markup in (('Names1: ', html_page1), ('Names2: ', html_page2)):
    page = BeautifulSoup(markup, 'html.parser')
    headings = page.find_all('h2', string=pattern)
    print(label, headings[0].find_next('p').text)
Or change string to text
from bs4 import BeautifulSoup
import re

html_page1 = """
<div class='product-box'>
<h2 class='title'>Title Book</h2>
<p class='price'>18.45</p>
</div>
"""
html_page2 = """
<div class='page-box'>
<h2 class='orange-heading'>Title Book</h2>
<p class='blue-price'>18.45</p>
</div>
"""

# Title heuristic: two consecutive capitalised words (compiled once).
pattern = re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))")

# text= matches the NavigableString itself; find_next('p') then jumps to
# the following paragraph holding the price.
for label, markup in (('Names1: ', html_page1), ('Names2: ', html_page2)):
    page = BeautifulSoup(markup, 'html.parser')
    hits = page.find_all(text=pattern)
    print(label, hits[0].find_next('p').text)
EDITED
Use text to get the element without tag and next_element to get the value of price.
from bs4 import BeautifulSoup
import re

html_page1 = """
<div class='product-box'>
<h2 class='title'>Title Book</h2>
<p class='price'>18.45</p>
</div>
"""
html_page2 = """
<div class='page-box'>
<h2 class='orange-heading'>Title Book</h2>
<p class='blue-price'>18.45</p>
</div>
"""
# turn page into soup
soup1 = BeautifulSoup(html_page1, 'html.parser')
# find book titles (text= matches the NavigableString, not the <h2> tag)
names1 = soup1.find_all(text=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))"))
# print titles
print('Names1: ', names1[0])
# Three .next_element hops appear to walk from the title string through the
# intervening nodes to the price text — fragile, as it depends on the exact
# markup layout; verify against the parse tree before reusing.
print('Price1: ', names1[0].next_element.next_element.next_element)
# turn page into soup
soup2 = BeautifulSoup(html_page2, 'html.parser')
# find book titles
names2 = soup2.find_all(text=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))"))
# print titles
print('Names2: ', names2[0])
print('Price2: ', names2[0].next_element.next_element.next_element)
Output:
Names1: Title Book
Price1: 18.45
Names2: Title Book
Price2: 18.45
You missed class closure comma for the p.price in html_page1.
With names1 = soup1.find_all(text=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))")) you get NavigableString, that's why you'll get None for the next_sibling.
You can find solution with regex in #Kunduk answer.
Alternative more clear and simple solution for the both html_page1 and html_page2:
# Select every box-like container, then read title and price from it.
soup = BeautifulSoup(html_page1, 'html.parser')
# or BeautifulSoup(html_page2, 'html.parser')
for book in soup.select('div[class*=box]'):
    title_text = book.select_one('h2').text
    price_text = book.select_one('p[class*=price]').text
    print(title_text, price_text)
div[class*=box] mean div where class contains box.

Getting length of sentence between tag in BeautifulSoup4

I am trying to collect some statistics from a website. What I am trying to do is extract a word and a count of the neighbouring words found within the same tag. For example:
Input
<div class="col-xs-12">
<p class="w50">Operating Temperature (Min.)[°C]</p>
<p class="w50 upperC">-40</p>
</div>
would result into
TAG 1
Operating , 2 i.e #<Temperature, (Min.)[°C]>
Temperature, 2 i.e #<Operating, (Min.)[°C]>
(Min.)[°C], 2 i.e #<Operating,Temperature>
TAG 2
-40, 0
this what I ended up to, but it extracts text as whole
url = 'https://www.rohm.com/products/wireless-communication/wireless-lan-modules/bp3580-product#'

# Fetch the page; the context manager closes the connection afterwards.
with urllib.request.urlopen(url) as url:
    page = url.read()

soup = BeautifulSoup(page, features='lxml')

# Drop script/style contents entirely.
for script in soup(["script", "style"]):
    script.decompose()  # rip it out

# Unwrap <br> tags so they do not split text nodes.
for tag in ['br']:
    for match in soup.findAll(tag):
        match.replaceWithChildren()

html = soup.find_all(recursive=False)
for tag in html:
    print(tag.get_text())
I tried it with recursive=True, but the result contains a lot of duplicates.
It might not be the result you expected, but at least it gives you a hint. I modified your code a little bit.
url = 'https://www.rohm.com/products/wireless-communication/wireless-lan-modules/bp3580-product#'

with urllib.request.urlopen(url) as url:
    page = url.read()

soup = BeautifulSoup(page, features='lxml')

# Drop script/style contents entirely.
for script in soup(["script", "style"]):
    script.decompose()  # rip it out

# Unwrap <br> tags so they do not split text nodes.
invalid_tags = ['br']
for tag in invalid_tags:
    for match in soup.findAll(tag):
        match.replaceWithChildren()

# Collect every non-empty line of text from the top-level tags.
textlist = [
    line
    for tag in soup.find_all(recursive=False)
    for line in tag.text.replace("\r", "").replace("\t", "").split("\n")
    if line != ''
]

# For each line, print every word with the count of its neighbours
# (words in the same line, minus the word itself).
for tt in textlist:
    print(tt)
    words = tt.split()
    for ts in words:
        print("{}, {}".format(ts, len(words) - 1))
    print("-----------------------------")

Certain content not loading when scraping a site with Beautiful Soup

I'm trying to scrape the ratings off recipes on NYT Cooking but having issues getting the content I need. When I look at the source on the NYT page, I see the following:
<div class="ratings-rating">
<span class="ratings-header ratings-content">194 ratings</span>
<div class="ratings-stars-wrap">
<div class="ratings-stars ratings-content four-star-rating avg-rating">
The content I'm trying to pull out is 194 ratings and four-star-rating. However, when I pull in the page source via Beautiful Soup I only see this:
<div class="ratings-rating">
<span class="ratings-header ratings-content"><%= header %></span>
<div class="ratings-stars-wrap">
<div class="ratings-stars ratings-content <%= ratingClass %> <%= state %>">
The code I'm using is:
url = 'https://cooking.nytimes.com/recipes/1020049-lemony-chicken-soup-with-fennel-and-dill'
# `get` and `headers` come from the surrounding script (presumably
# requests.get and a dict of request headers — confirm against the full code).
r = get(url, headers = headers, timeout=15)
# `soup` here is evidently the BeautifulSoup constructor imported under an alias.
page_soup = soup(r.text,'html.parser')
Any thoughts why that information isn't pulling through?
Try using below code
import requests
import lxml
from lxml import html
import re

url = "https://cooking.nytimes.com/recipes/1019706-spiced-roasted-cauliflower-with-feta-and-garlic?action=click&module=Recirculation%20Band%20Recipe%20Card&region=More%20recipes%20from%20Alison%20Roman&pgType=recipedetails&rank=1"
r = requests.get(url)
tree = html.fromstring(r.content)
t = tree.xpath('/html/body/script[14]')[0]


def _value_after(script_text, marker):
    # Return the slice of *script_text* between *marker* and the next ';'.
    start = re.search(marker, script_text).end()
    end = re.search(";", script_text[start::]).start()
    return script_text[start:start + end]


# look for value for bootstrap.recipe.avg_rating
rating = _value_after(t.text, "bootstrap.recipe.avg_rating = ")
print(rating)

# look for value for bootstrap.recipe.num_ratings =
star = _value_after(t.text, "bootstrap.recipe.num_ratings = ")
print(star)
much easier to use attribute = value selectors to grab from span with class ratings-metadata
import requests
from bs4 import BeautifulSoup

# The itemprop microdata attributes carry the rating values directly.
resp = requests.get('https://cooking.nytimes.com/recipes/1020049-lemony-chicken-soup-with-fennel-and-dill')
page = BeautifulSoup(resp.content, 'lxml')
rating = page.select_one('[itemprop=ratingValue]').text
ratingCount = page.select_one('[itemprop=ratingCount]').text
print(rating, ratingCount)

beautiful soup getting <s> tag or getting price

I want to get the price from this page
http://www.fastfurnishings.com/Aura-Floor-Lamp-p/lsi_ls-aurafl-gy-gn.htm
The price is within
<font class="text colors_text"><b>Retail Price:<s></b> $199.00 </font><br /><b><FONT class="pricecolor colors_productprice"></s>Price: <span itemprop='price'>$139.00</span> </font></b>
I want this price: $139.00
I have the code below, but it doesn't find the price.
html = urllib2.urlopen(value)
soup = BS(html)
foundPrice = soup.findAll('span', {'itemprop':'price'})
if found is not None:
print "found a price"
else:
print" No Lunk"
In the following code:
foundPrice = soup.findAll('span', {'itemprop':'price'})
if found is not None:
You assigned result of findAll to foundPrice, but if statement compare found.
Try following:
import urllib2
from bs4 import BeautifulSoup
url = 'http://www.fastfurnishings.com/Aura-Floor-Lamp-p/lsi_ls-aurafl-gy-gn.htm'
u = urllib2.urlopen(url)
try:
soup = BeautifulSoup(u)
finally:
u.close()
span = soup.find('span', {'itemprop':'price'})
if span is not None:
print span.string
else:
print 'Not found'

Categories

Resources