I'm trying to parse an HTML result, grab a few URLs, and then parse the output of visiting those URLs.
I'm using django 1.5 /python 2.7:
views.py
#mechanize/beautifulsoup config options here.
beautifulSoupObj = BeautifulSoup(mechanizeBrowser.response().read()) #read the raw response
getFirstPageLinks = beautifulSoupObj.find_all('cite') #get first page of urls
url_data = UrlData(NumberOfUrlsFound, getDomainLinksFromGoogle)
#url_data = UrlData(5, 'myapp.com')
#return HttpResponse(MaxUrlsToGather)
print url_data.url_list()
return render(request, 'myapp/scan/process_scan.html', {
'url_data':url_data,'EnteredDomain':EnteredDomain,'getDomainLinksFromGoogle':getDomainLinksFromGoogle,
'NumberOfUrlsFound':NumberOfUrlsFound,
'getFirstPageLinks' : getFirstPageLinks,
})
urldata.py
class UrlData(object):
    """Generate paginated URLs from a base URL.

    Attributes are set directly from the constructor arguments:
    num_of_urls is the number of page URLs to generate and url_pattern is
    the base URL string that the '&start=' paging parameter is appended to.
    """

    def __init__(self, num_of_urls, url_pattern):
        self.num_of_urls = num_of_urls
        self.url_pattern = url_pattern

    def url_list(self):
        """Return a list of ``num_of_urls`` URL strings.

        The n-th entry (counting from 1) is
        ``url_pattern + '&start=' + str(n * 10) + ','``,
        e.g. asite.com/?search?start=10
        """
        # NOTE(review): the trailing ',' is part of every returned string.
        # It is kept because existing callers/templates print it, but it must
        # be stripped before any of these strings is used as a real URL.
        # range() instead of xrange() so the class runs on Python 2 and 3.
        return [
            self.url_pattern + '&start=' + str(page * 10) + ','
            for page in range(1, self.num_of_urls + 1)
        ]
template:
{{ getFirstPageLinks }}
{% if url_data.num_of_urls > 0 %}
{% for url in url_data.url_list %}
{{ url }}
{% endfor %}
{% endif %}
This outputs:
[<cite>www.google.com/webmasters/</cite>, <cite>www.domain.com</cite>, <cite>www.domain.comblog/</cite>, <cite>www.domain.comblog/projects/</cite>, <cite>www.domain.comblog/category/internet/</cite>, <cite>www.domain.comblog/category/goals/</cite>, <cite>www.domain.comblog/category/uncategorized/</cite>, <cite>www.domain.comblog/twit/2013/01/</cite>, <cite>www.domain.comblog/category/dog-2/</cite>, <cite>www.domain.comblog/category/goals/personal/</cite>, <cite>www.domain.comblog/category/internet/tech/</cite>]
which is generated by: getFirstPageLinks
and
https://www.google.com/search?q=site%3Adomain.com&start=10, https://www.google.com/search?q=site%3Adomain.com&start=20,
which is generated by: url_data a template variable
The problem currently is: I need to loop through each URL in url_data and get the output in the same form that getFirstPageLinks produces.
How can I achieve this?
Thank you.
Related
I'm trying to use django template for the first time and I need to export some data. How can I export a table in my django template to a .xlsx file? Is there any method? here is my views.py:
from django.shortcuts import render
import requests
from .utils import most_frequent, get_json_values, get_max_dict
def launches(request):
    """Render "main/launches.html" with three statistics from the SpaceX API.

    Issues three GET requests against api.spacexdata.com/v3/launches and
    passes the results to the template as the ``data`` dict. The helpers
    ``get_json_values``, ``most_frequent`` and ``get_max_dict`` are imported
    from ``.utils``; their exact semantics are not visible from this view.
    """
    # Main request: retrieve the year that had most launches.
    # The URL must be a single string literal; a literal broken across
    # physical lines is a syntax error.
    response_launches = requests.get(
        'https://api.spacexdata.com/v3/launches?filter=launch_year')
    launches_json = response_launches.json()
    launch_years = get_json_values('launch_year', launches_json)
    result_launches = most_frequent(launch_years)

    # Retrieve the launch site most used for launches.
    response_sites = requests.get(
        'https://api.spacexdata.com/v3/launches?filter=launch_site')
    sites = response_sites.json()
    launch_sites = get_json_values("launch_site", sites)
    result_sites = get_max_dict(launch_sites, 'site_id')

    # Retrieve the number of launches between 2019 and 2021.
    response_2019_2021 = requests.get(
        'https://api.spacexdata.com/v3/launches?start=2019&end=2021')
    launches_2019_2021 = response_2019_2021.json()
    result_2019_2021 = len(launches_2019_2021)

    # Key order matters: the template iterates ``data.values`` and expects
    # the values in the same order as its table header cells.
    data = {
        "year_most_launches": str(result_launches),
        "launch_sites": result_sites,
        "launches_2019_2021": str(result_2019_2021)
    }
    return render(request, "main/launches.html", {"data": data})
And that's my table inside my template:
<table class="table">
<thead>
<tr>
<th>Year with most launches</th>
<th>Launch site with most launches</th>
<th>Number of launches between 2019 and 2021</th>
</tr>
</thead>
<tbody>
<tr>
{% for element in data.values %}
<td>{{ element }}</td>
{% endfor %}
</tr>
</tbody>
</table>
I couldn't find a way to do it so any help is really appreciated!
In my case, I created a function in the view that handles all the data and inserts it into an Excel file.
Let me give you the code, it may help you.
def export_excel(self, elements):
    """Build an in-memory .xlsx workbook from *elements* and return it as a download.

    Row 0 holds the header cells; each item of *elements* then gets its own
    row, with ``element.data_name`` written to column 0. The finished
    workbook is returned in an ``HttpResponse`` marked as an attachment.
    """
    output = io.BytesIO()
    workbook = Workbook(output, {'in_memory': True})
    worksheet = workbook.add_worksheet()

    # Table header: one cell per header name, all in the first row.
    table_headers = [
        # Define your headers name here
        ''
    ]
    for column, header in enumerate(table_headers):
        worksheet.write(0, column, header)

    # Data rows start directly below the header row.
    for row, element in enumerate(elements, 1):
        # write your data in the excel following the same command
        worksheet.write(row, 0, element.data_name)

    # close() finalises the workbook into ``output``.
    workbook.close()
    payload = output.getvalue()

    content_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    response = HttpResponse(payload, content_type=content_type)
    file_name = _('file_name')
    response['Content-Disposition'] = "attachment; filename=" + file_name + ".xlsx"
    return response
Let me know if it was useful
I have HTML list formed that way (It's what CKeditor create for nested list):
<ul>
<li>niv1alone</li>
<li>niv1
<ul>
<li>niv2
<ul>
<li>niv3
<ul>
<li>niv4</li>
</ul></li></ul></li></ul></li>
<li>autre niv1 alone</li>
</ul>
How do I form a "recursive list" like that:
[
('niv1alone',[]),('niv1',[('niv2',[('niv3',[('niv4',[])])])]),('autre niv1 alone',[])
]
I have tried several things with beautifulsoup but I can't get the desired result.
Here's a recursive function that does something similar to what you ask. The trick to writing recursive functions is to make the problem smaller and then recurse on it. Here I'm walking down the element tree and passing the children, which are a strictly smaller set than the one before.
import bs4
html = '''
<ul>
<li>niv1alone</li>
<li>niv1
<ul>
<li>niv2
<ul>
<li>niv3
<ul>
<li>niv4</li>
</ul></li></ul></li></ul></li>
<li>autre niv1 alone</li>
</ul>
'''
def make_tree(body: bs4.Tag):
    """Convert the children of *body* into a nested list.

    Text children contribute their stripped text (whitespace-only text is
    dropped); every other child contributes the list produced by recursing
    into it.
    """
    nodes = []
    for child in body.children:
        if not isinstance(child, bs4.NavigableString):
            # An element: descend and keep its subtree as a nested list.
            nodes.append(make_tree(child))
            continue
        text = str(child).strip()
        if text:  # skip whitespace-only text nodes
            nodes.append(text)
    return nodes
if __name__ == '__main__':
soup = bs4.BeautifulSoup(html, 'html.parser')
tree = make_tree(soup.select_one('ul'))
print(tree)
output:
[['niv1alone'], ['niv1', [['niv2', [['niv3', [['niv4']]]]]]], ['autre niv1 alone']]
my url is http://example.com/en/cat/ap+da+w_pl
Now I have a-tag like this:
<a href="{{ url_for('category',
feature=request.path+"+"+att.get('u_sg'))}}">
{{ att.get('name') }}
</a>
request.path is giving me '/en/cat/ap+da+w_pl', but I need only /ap+da+w_pl
How to do it?
I need to pass only 'ap+da+w_pl' from out of request.path from HTML only, as I have to use it in pre-coded View of Flask and my view is like THIS:
#app.route('<lan_c>/cat/<string:feature>')
def category(feature, page):
Consider current url is 'http://example.com/en/cat/ap+da+w_pl'
if user click on a-tag then I want to append value returned from 'att.get('u_sg')'.
The problem I am facing right now is my a-tag is considering 'http://example.com/en/cat/en/cat/ap+da+w_pl+w_pl2', so I wanted to send only 'ap+da+w_pl' + 'att.get('u_sg')'. So that a-tag will point to 'http://example.com/en/cat/ap+da+w_pl+w_pl2'
You could split the result by / and get the last key:
>>> r = 'http://example.com/en/cat/ap+da+w_pl'.split('/')
>>> r[-1]
'ap+da+w_pl'
This would work for /en/cat/ap+da+w_pl the same way:
>>> r = '/en/cat/ap+da+w_pl'.split('/')
>>> r[-1]
'ap+da+w_pl'
Prepend the / if needed:
>>> '/'+(r[-1])
'/ap+da+w_pl'
I'm trying to build my own RSS feed with download links,
but the RSS feed provides only the link to the whole season.
I'm taking that link to the whole season and want to extract the specific download link for the episode itself (uploaded/ul).
This is what I've got so far.
Is there any way to get that working?
import feedparser, urllib2, re
from BeautifulSoup import BeautifulSoup
episodenliste = ['Game.of.Thrones','Arrow']
episode_link = []
episode_title = []
d = feedparser.parse('http://serienjunkies.org/xml/feeds/episoden.xml')
for post in d.entries:
if ('DEUTSCH' in post.title) and any (word in post.title for word in episodenliste) and ('720p' in post.title):
post.title = post.title.replace('[DEUTSCH] ','')
post.title = re.sub(r'(.*S\d+E\d+)(.*)',r'\1' ,post.title)
episode_link.append(post.link)
episode_title.append(post.title)
print post.title + ": " + post.link + "\n"
for search_title in episode_title:
for get_dlLink in episode_link:
page_ = urllib2.Request(get_dlLink)
page = urllib2.urlopen(page_).read()
soup = BeautifulSoup(page)
print search_title
title = soup.find('strong', text=search_title)
if title is not None:
print title
# link = title.parent
# links = link.find_all('a')
# print links
# for link2 in links:
# url = link2['href']
# print url
# pattern = 'http:\/\/download\.serienjunkies\.org.*%s_.*\.html' % ul
# if re.match(pattern, url):
# print url
As far as I can tell, it works up to the point where I'm searching the page for the title.
It gets to the pages parsed from the RSS feed, but it doesn't find the title.
My idea was:
first find the title and then extract the 'children'/links from it.
any help is appreciated
thanks in advance
Without JavaScript enabled the HTML looks quite different:
<p><strong>Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS</strong><br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-55bc328624d93658/fm_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | filemonkey.in<br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-25023a87144345f9/so_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | share-online.biz<br>
<strong>Download:</strong> <a target="_blank" href="http://download.serienjunkies.org/f-3e8ea978a2cf7bda/ul_tvs-arrow-dd51-ded-dl-7p-ithd-avc-214.html">hier</a> | uploaded.to</p>
As the title from the RSS feed without the [DEUTSCH] prefix is the first text in the paragraph on the page for the series, it can be the basis for searching and extracting the entry. Two elements up there is the <p> tag containing all the data for the episode. And that's links followed by the name of the file hoster.
import feedparser
import requests
from bs4 import BeautifulSoup
FEED_URL = 'http://serienjunkies.org/xml/feeds/episoden.xml'
def is_interesting_entry(entry, title_prefix, series_names):
    """Return True when *entry* belongs to one of the wanted series.

    The entry's title must start with *title_prefix* and contain at least
    one of the strings in *series_names*.
    """
    if not entry.title.startswith(title_prefix):
        return False
    return any(name in entry.title for name in series_names)
def process_entry(entry, title_prefix):
    """Return ``(title, link)`` for *entry* with *title_prefix* removed from the title.

    Raises:
        ValueError: if the entry's title does not start with *title_prefix*.
    """
    if entry.title.startswith(title_prefix):
        stripped_title = entry.title[len(title_prefix):]
        return (stripped_title, entry.link)
    message = 'expected prefix {0!r} not found in {1!r}'.format(
        title_prefix, entry.title
    )
    raise ValueError(message)
def process_feed(feed_url, title_prefix, series_names):
    """Return a generator of ``process_entry`` results for the wanted feed entries.

    The feed at *feed_url* is parsed immediately; filtering with
    ``is_interesting_entry`` and conversion with ``process_entry`` happen
    lazily, one entry at a time, as the generator is consumed.
    """
    entries = feedparser.parse(feed_url).entries
    wanted = (
        entry
        for entry in entries
        if is_interesting_entry(entry, title_prefix, series_names)
    )
    return (process_entry(entry, title_prefix) for entry in wanted)
def get_series_soup(url, cache=dict()):
    """Return the parsed HTML of *url*, downloading each URL at most once.

    Args:
        url: address of the page to fetch.
        cache: dict mapping URL -> parsed soup. The default dict is created
            once, when the function is defined, and is deliberately shared
            between calls so it works as a memo; pass your own dict to use
            a separate cache.
    """
    try:
        return cache[url]
    except KeyError:
        pass
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, and different parsers can build
    # different trees from the same markup.
    result = BeautifulSoup(requests.get(url).text, 'html.parser')
    cache[url] = result
    return result
def get_download_urls(soup, title):
    """Return a dict mapping hoster name to download URL for the episode *title*.

    Looks up the text node equal to *title* in *soup*; when it is missing an
    empty dict is returned. Otherwise every ``<a>`` found two levels above
    that text node is collected: the key is the text following the link
    (with surrounding '|' and spaces stripped), the value is its ``href``.
    """
    title_text = soup.find(text=title)
    if not title_text:
        return dict()

    # Two levels up from the title text is the element holding the links.
    episode_block = title_text.parent.parent
    hoster2url = dict()
    for a_tag in episode_block('a'):
        hoster = a_tag.next_sibling.strip('| ')
        hoster2url[hoster] = a_tag['href']
    return hoster2url
def main():
series_names = ['Game.of.Thrones', 'Arrow']
for title, url in process_feed(FEED_URL, '[DEUTSCH] ', series_names):
print
print title
hoster2url = get_download_urls(get_series_soup(url), title)
if hoster2url:
for hoster, download_url in sorted(hoster2url.iteritems()):
print '{0:>20s}: {1}'.format(hoster, download_url)
else:
print ' --- No downloads ---'
if __name__ == '__main__':
main()
PASTEBIN
<item>
<title>[DEUTSCH] Arrow.S02E14.Gegen.die.Zeit.GERMAN.DUBBED.720p.HDTV.x264-ZZGtv</title>
<description>[DEUTSCH] Arrow.S02E14.Gegen.die.Zeit.GERMAN.DUBBED.720p.HDTV.x264-ZZGtv</description>
<pubDate>Fri, 18 Jul 2014 00:00:00 +0200</pubDate>
<link>http://serienjunkies.org/arrow/arrow-staffel-2-hdtvweb-dl-sd720p1080p/</link>
</item>
sorry, didnt know that
<p><strong>Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS</strong><br><div id="download_mirrors" class="download_main"><strong>Download:</strong> uploaded.net <span style="font-size:10px">(best speed) </span><br><strong style="margin-left:14px">Mirrors:</strong> <img src="http://serienjunkies.org/media/img/stream/application_cascade.png" style="cursor:pointer;" title="Mirrors zeigen" onclick="toggle("Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS");"><div id="Arrow.S02E14.Gegen.die.Zeit.German.DD51.Dubbed.DL.720p.iTunesHD.AVC-TVS" style="display: none;">
<strong style="margin-left:20px">Mirror:</strong> filemonkey.in<br>
<strong style="margin-left:20px">Mirror:</strong> share-online.biz<br>
</div><div><strong style="margin-left:18px">Usenet:</strong> Highspeed Mirror</div></div></p>
I am trying to pull list of data from website using Beautiful Soup:
class burger(webapp2.RequestHandler):
    """Handler that scrapes link texts from a qaym.com page and renders index.html."""

    # The page is downloaded once, when the class body is executed.
    Husam = urlopen('http://www.qaym.com/city/77/category/3/%D8%A7%D9%84%D8%AE%D8%A8%D8%B1/%D8%A8%D8%B1%D8%AC%D8%B1/').read()

    # The default ``soup`` is likewise parsed once, at definition time.
    def get(self, soup = BeautifulSoup(Husam)):
        """Write the rendered template to the response for each matching link."""
        for tag in soup.find_all("a", class_ = "bigger floatholder"):
            template_values = {'me2': tag.get_text("\n")}
            # NOTE: rendering happens inside the per-link loop, so the whole
            # template is written to the response once for every link found.
            for template in template_values:
                template = jinja_environment.get_template('index.html')
                self.response.out.write(template.render(template_values))
Now when I try to show the data in the template using jinja2, it repeats the whole template once per list item and puts each single entry in its own copy of the template.
How do I put the whole list in one tag and still be able to edit other tags without repeating them?
<li>{{ me2}}</li>
To output a list of entries, you can loop over them in your jinja2 template like this:
{%for entry in me2%}
<li> {{entry}} </li>
{% endfor %}
To use this, your python code also has to put the tags into a list.
Something like this should work:
def get(self, soup=BeautifulSoup(Husam)):
    """Render index.html once, passing the text of every matching link as a list."""
    tago = soup.find_all("a", class_="bigger floatholder")
    # Collect the text of each tag into a single list
    values = [tag.get_text("\n") for tag in tago]
    template = jinja_environment.get_template('index.html')
    # Hand the whole list to jinja2 under the 'me2' key and render one time
    self.response.out.write(template.render({'me2': values}))
References:
Jinja2 template documentation