How to convert web scraping into a Django API? - python

I'm trying to scrape some data from two websites, and the scraping itself works. I also want to build an API with Django that serves this scraped data, but when I try to display the scraped data as JSON in Django, it only shows an empty list. My code snippets are below.
from django.shortcuts import render
from bs4 import BeautifulSoup
import requests
import re
import json
import time

data = []

def getURL(url):
    url = url.replace(' ', '-').lower()
    for char in url:
        if char in "?.!:;|/[]&()":
            url = url.replace(char, '-')
        if char == "'" or char == ",":
            url = url.replace(char, '')
    decodeUrl = re.sub(r'-+', '-', url)
    # check whether the URL is up or not
    parsedUrl = "http://www.tutorialbar.com/" + decodeUrl + "/"
    if requests.head(parsedUrl).status_code == 200:
        return parsedUrl

urls = ['https://www.discudemy.com/all', 'https://www.discudemy.com/all/2']

for url in urls:
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'html5lib')
    # print(soup)
    for content in soup.find_all('section', class_="card"):
        # print(content)
        try:
            language = content.label.text
            header = content.div.a.text
            day = content.find('span', class_="category").text
            i = content.find('div', class_="image")
            img = i.find('amp-img')['src']
            image = img.replace('240x135', '750x422')
            description = content.find('div', class_="description").text.lstrip()
            myURL = getURL(header)
            udemyURL = requests.get(myURL).text
            udemySoup = BeautifulSoup(udemyURL, 'html5lib')
            udemylink = udemySoup.find_all('a', class_="btn_offer_block re_track_btn")[0]["href"]
            entry = {
                'language': language,
                'header': header,
                'day': day,
                'image': image,
                'description': description,
                'courselink': udemylink,
            }
            data.append(entry)
            print()
        except Exception as e:
            continue

print(json.dumps(data))
print()
print(data)

def index(req):
    return render(req, 'index.html', {'courses': json.dumps(data)})
Below is my HTML file for displaying JSON data.
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>UdemyCourses</title>
</head>
<body>
    {{ courses }}
</body>
</html>
There is some delay while scraping the data, and I think that might be the problem. I don't know how to handle asynchronous programming in Python. Is there any way to achieve this? I'm a beginner. Help me out. Thanks in advance.
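One thing worth checking before reaching for async (a guess based on the code above, not a confirmed fix): the scraping loop runs once at module import time, and the bare except/continue silently swallows every network or parsing error, so any failure (for example getURL returning None) leaves data as an empty list; temporarily printing e inside the except block should reveal what is happening. For serving the result as an actual API, Django's built-in JsonResponse is the usual tool. A minimal sketch reusing the module-level data list from above (the view name courses_api is hypothetical):

from django.http import JsonResponse

def courses_api(request):
    # data is the module-level list filled by the scraping loop above;
    # safe=False lets JsonResponse serialize a top-level list instead of a dict.
    return JsonResponse(data, safe=False)

Wired to a URL pattern, this returns the scraped courses as application/json instead of rendering them into a template.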

Related

How to preserve spaces and indents while parsing an HTML file with bs4?

I have a bookmarks file exported from Chrome. The contents look like this:
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
     It will be read and overwritten.
     DO NOT EDIT! -->
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
    <DT><H3 ADD_DATE="1598912308" LAST_MODIFIED="1622706295" PERSONAL_TOOLBAR_FOLDER="true">Bookmarks bar</H3>
    <DL><p>
        <DT><H3 ADD_DATE="1599868499" LAST_MODIFIED="1622998728">情報工学</H3>
        <DL><p>
            <DT>HackHub
            <DT>Nifty Assignments
            <DT>USACO
...
What I want to do is modify the links (href) inside the <a> tags.
My code looks like this:
import re
from bs4 import BeautifulSoup

BOOKMARKS_FILE = 'bookmarks_6_8_21.html'
BASE_URLS = [
    'https://somepage.com/id/'
]

with open(BOOKMARKS_FILE, 'r', encoding='utf8') as f:
    soup = BeautifulSoup(f, 'html.parser')

def get_uuid(int_id):
    uuid = int_id + 1
    return uuid

for link in soup.find_all('a'):
    url = link['href']
    for base_url in BASE_URLS:
        if url.startswith(base_url):
            id = int(re.search(r'\d+', url).group())  # extract the integer id after https://somepage.com/id/...
            uuid = get_uuid(id)
            new_url = base_url + str(uuid)
            print(f'New url: {new_url}', end='\n\n')
            link['href'] = new_url

with open('Modified_' + BOOKMARKS_FILE, 'w', encoding='utf8') as f:
    f.write(str(soup))
Unfortunately, the modified bookmarks file doesn't preserve the original indentation, so when I tried to import it into Chrome, all the folders I created were gone. Is there any way I can preserve the indentation from the original file?
Using @smci's link, I decided to give it a shot. I replaced these lines
with open(BOOKMARKS_FILE, 'r', encoding='utf8') as f:
    soup = BeautifulSoup(f, 'html.parser')
with
with open(BOOKMARKS_FILE, 'r', encoding='utf8') as f:
    htm = f.read()
    htm = re.sub(r'(\</?)(DL)(\>)', r'\1pre\3', htm)
    htm = re.sub(r'(\</?)(p)(\>)', r'\1pre\3', htm)
    soup = BeautifulSoup(htm, 'html.parser')

with open('Modified_xx_' + BOOKMARKS_FILE, 'w', encoding='utf8') as f:
    f.write(str(soup))
I checked the output file, and I still don't get the result I expected. The bottom of the output file is a mess:
<dt><a add_date="1600302193" href="javascript:void(window.open('https://web.archive.org/web/*/'+document.location.href));">Wayback</a>
<dt><a add_date="1600302244" href="javascript:void(window.open('https://web.archive.org/save/'+location.href));">Wayback Save</a>
</dt></dt></dt></dt></pre><pre>
</pre><pre>
</pre></pre></dt></pre></pre></dt></pre></pre></dt></pre></pre></dt>... (the same </pre></pre></dt> sequence repeats dozens more times)
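One way to sidestep the re-serialization problem entirely is to never let BeautifulSoup rewrite the file: edit the href values line by line with a regular expression and copy every other byte through untouched, so the original indentation and the DL/DT structure survive. A minimal sketch, assuming the same BOOKMARKS_FILE and base URL as above:

import re

BOOKMARKS_FILE = 'bookmarks_6_8_21.html'
BASE_URL = 'https://somepage.com/id/'

# Match HREF="https://somepage.com/id/<digits>" and bump the id by one,
# leaving the rest of each line (and the file's whitespace) untouched.
pattern = re.compile(r'(HREF=")(' + re.escape(BASE_URL) + r')(\d+)', re.IGNORECASE)

def bump_id(match):
    return match.group(1) + match.group(2) + str(int(match.group(3)) + 1)

with open(BOOKMARKS_FILE, 'r', encoding='utf8') as src, \
        open('Modified_' + BOOKMARKS_FILE, 'w', encoding='utf8') as dst:
    for line in src:
        dst.write(pattern.sub(bump_id, line))

Since nothing is parsed or pretty-printed, the file should import back into Chrome with its folder structure intact.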

Web scraping using python

I'm scraping all the comments from the https://www.consumeraffairs.com/privacy/transunion.html website:
page_list = []

def pagination(soup):
    for i in range(0, 32):
        domain = "https://www.consumeraffairs.com/privacy/transunion.html?page=" + str(i)
        page_list.append(domain)
    return page_list

pages = pagination(soup)
print(pages)
How do I capture the comments on each of these pages? Here is what I tried:
import time

comment_list = []

def get_comments(urls):
    for url in urls:
        try:
            print(url)
            comment = soup.find_all('div', {'class': 'rvw-bd'})
            print(len(comment))
            for x in range(len(comment)):
                comment_list.append(comment[x].p.text.strip())
        except:
            continue
        time.sleep(30)
    return comment_list

comments = get_comments(pages)
I used this code, but it scrapes only the first 10 comments from the first page. How do I fix this?
I think you were on the right track changing the "page=" value in the URL, but from the code you posted, it doesn't seem like you updated the soup object to reflect the content of each new page. I rewrote some of your code to do this:
from bs4 import BeautifulSoup
import requests
import time

page_list = []
for i in range(0, 32):
    domain = "https://www.consumeraffairs.com/privacy/transunion.html?page=" + str(i)
    page_list.append(domain)

comment_list = []
for page in page_list:
    try:
        content = requests.get(page).content
        soup = BeautifulSoup(content, 'html.parser')
        comment = soup.find_all('div', {'class': 'rvw-bd'})
        print(len(comment))
        for x in range(len(comment)):
            comment_list.append(comment[x].p.text.strip())
    except:
        continue
    time.sleep(30)

print(comment_list)
print(len(comment_list))
Let me know if this does/doesn't help!

Getting javascript variable value while scraping with python

I know this has been asked before, but I am a newbie at scraping and Python. Please help me; it will be very helpful on my learning path.
I am scraping a news site using Python with packages such as BeautifulSoup.
I am having difficulty getting the value of a JavaScript variable that is declared in a script tag and is also updated there.
Here is the part of the HTML page I am scraping (containing only the script part):
<!-- Eliminate render-blocking JavaScript and CSS in above-the-fold content -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/0.97.0/js/materialize.min.js"></script>
<script type="text/javascript" src="/dist/scripts/index.js"></script>
<script type="text/javascript" src="/dist/scripts/read.js"></script>
<script src="/dist/scripts/jquery.scrolldepth.min.js"></script>
<script type="text/javascript">
    var min_news_id = "d7zlgjdu-1"; // line 1
    function loadMoreNews(){
        $("#load-more-btn").hide();
        $("#load-more-gif").show();
        $.post("/en/ajax/more_news",{'category':'politics','news_offset':min_news_id},function(data){
            data = JSON.parse(data);
            min_news_id = data.min_news_id||min_news_id; // line 2
            $(".card-stack").append(data.html);
        })
        .fail(function(){alert("Error : unable to load more news");})
        .always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
    }
    jQuery.scrollDepth();
</script>
From the above, I want to get the value of min_news_id in Python.
I should also get the value of the same variable if it is updated at line 2.
Here is how I am doing it:
self.pattern = re.compile('var min_news_id = (.+?);')  # or self.pattern = re.compile('min_news_id = (.+?);')
page = bs(htmlPage, "html.parser")
# find all the script tags
scripts = page.find_all("script")
for script in scripts:
    for line in script:
        scriptString = str(line)
        if "min_news_id" in scriptString:
            scriptString.replace('"', '\\"')
            print(scriptString)
            if self.pattern.match(str(scriptString)):
                print("matched")
                data = self.pattern.match(scriptString)
                jsVariable = json.loads(data.groups()[0])
                InShortsScraper.newsOffset = jsVariable
                print(InShortsScraper.newsOffset)
But I never get the value of the variable. Is the problem with my regular expression, or something else? Please help me.
Thank you in advance.
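A side note on the regular expression before the answers: pattern.match() only matches at the very beginning of the string, and here the script text starts with a newline before var min_news_id, so it can never fire; pattern.search() scans the whole string. A minimal sketch of the difference:

import re

script = '\nvar min_news_id = "d7zlgjdu-1"; // line 1\nfunction loadMoreNews(){ }'
pattern = re.compile('var min_news_id = (.+?);')

print(pattern.match(script))             # None: match() is anchored at position 0
print(pattern.search(script).group(1))   # '"d7zlgjdu-1"': search() scans the string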
import re

html = '''<!-- Eliminate render-blocking JavaScript and CSS in above-the-fold content -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/0.97.0/js/materialize.min.js"></script>
<script type="text/javascript" src="/dist/scripts/index.js"></script>
<script type="text/javascript" src="/dist/scripts/read.js"></script>
<script src="/dist/scripts/jquery.scrolldepth.min.js"></script>
<script type="text/javascript">
var min_news_id = "d7zlgjdu-1"; // line 1
function loadMoreNews(){
    $("#load-more-btn").hide();
    $("#load-more-gif").show();
    $.post("/en/ajax/more_news",{'category':'politics','news_offset':min_news_id},function(data){
        data = JSON.parse(data);
        min_news_id = data.min_news_id||min_news_id; // line 2
        $(".card-stack").append(data.html);
    })
    .fail(function(){alert("Error : unable to load more news");})
    .always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
</script>'''

finder = re.findall(r'min_news_id = .*;', html)
print(finder)

Output:
['min_news_id = "d7zlgjdu-1";', 'min_news_id = data.min_news_id||min_news_id;']
#2 OR YOU CAN USE
print(finder[0].replace('min_news_id = ', '').replace('"','').replace(';','').strip())
Output:
d7zlgjdu-1
#3 OR YOU CAN USE
finder = re.findall(r'[a-z0-9]{8}-[0-9]', html)
print(finder)
Output:
['d7zlgjdu-1']
You can't monitor a JavaScript variable change using BeautifulSoup. Here is how to get the next page of news using a while loop, re, and json:
from bs4 import BeautifulSoup
import requests, re

page_url = 'https://inshorts.com/en/read/politics'
ajax_url = 'https://inshorts.com/en/ajax/more_news'
htmlPage = requests.get(page_url).text

# BeautifulSoup extract article summary
# page = BeautifulSoup(htmlPage, "html.parser")
# ...

# get current min_news_id
min_news_id = re.search(r'min_news_id\s+=\s+"([^"]+)', htmlPage).group(1)  # result: d7zlgjdu-1
customHead = {'X-Requested-With': 'XMLHttpRequest', 'Referer': page_url}

while min_news_id:
    # change "politics" if in a different category
    reqBody = {'category': 'politics', 'news_offset': min_news_id}
    # get Ajax next page
    ajax_response = requests.post(ajax_url, headers=customHead, data=reqBody).json()  # parse string to json
    # again, extract article summary
    page = BeautifulSoup(ajax_response["html"], "html.parser")
    # ....
    # new min_news_id
    min_news_id = ajax_response["min_news_id"]
    # remove this to loop all pages (thousands?)
    break
Thank you for the response. I finally solved it using the requests package after reading its documentation.
Here is my code:
if InShortsScraper.firstLoad == True:
    self.pattern = re.compile('var min_news_id = (.+?);')
else:
    self.pattern = re.compile('min_news_id = (.+?);')
page = None
# print("Pattern: " + str(self.pattern))
if news_offset == None:
    htmlPage = urlopen(url)
    page = bs(htmlPage, "html.parser")
else:
    self.loadMore['news_offset'] = InShortsScraper.newsOffset
    # print("payload : " + str(self.loadMore))
    try:
        r = myRequest.post(
            url = url,
            data = self.loadMore
        )
    except TypeError:
        print("Error in loading")
    InShortsScraper.newsOffset = r.json()["min_news_id"]
    page = bs(r.json()["html"], "html.parser")
    # print(page)
if InShortsScraper.newsOffset == None:
    scripts = page.find_all("script")
    for script in scripts:
        for line in script:
            scriptString = str(line)
            if "min_news_id" in scriptString:
                finder = re.findall(self.pattern, scriptString)
                InShortsScraper.newsOffset = finder[0].replace('min_news_id = ', '').replace('"','').replace(';','').strip()

extracting url using beautifulsoup

With this code:
url = "https://github.com/searcho=desc&p=1&q=stars%3A%3E1&s=stars&type=Repositoris"
with urllib.request.urlopen(url) as response:
html = response.read()
html = html.decode('utf-8')
with open('page_content.html', 'w', encoding='utf-8') as new_file:
new_file.write(html)
soup = BeautifulSoup(html,'lxml')
g_data= soup.findAll("a", {"class":"v-align-middle"})
print(g_data[0])
The output is:
<a class="v-align-middle" data-hydro-click='{"event_type":"search_result.click","payload":{"page_number":1,"query":"stars:>1","result_position":1,"click_id":28457823,"result":{"id":28457823,"global_relay_id":"MDEwOlJlcG9zaXRvcnkyODQ1NzgyMw==","model_name":"Repository","url":"https://github.com/freeCodeCamp/freeCodeCamp"},"originating_request_id":"ECC6:1DF24:CE9C0F:1667572:5A8DDD6F"}}' data-hydro-hmac="42c4e038b86cefc302d5637e870e6d746ee7fa95eadf2b26930cb893c6a3bc53" href="/freeCodeCamp/freeCodeCamp">freeCodeCamp/freeCodeCamp</a>
How do I extract the URL below from the output:
https://github.com/freeCodeCamp/freeCodeCamp
Thanks!
Get the value of the attribute, json.loads() it, and work with it as a regular Python dict:
import json
# your other code, up to setting the g_data
data_hydro = g_data[0]['data-hydro-click']
data_hydro = json.loads(data_hydro)
print(data_hydro['payload']['result']['url'])
It's inside a JSON string; that's why it's hard to get at:
html = """
<h3>
freeCodeCamp/freeCodeCamp
</h3>
"""
soup = BeautifulSoup(html, 'lxml')
parsed_json = json.loads(soup.a.get('data-hydro-click'))
parsed_json['payload']['result']['url']
# returns 'https://github.com/freeCodeCamp/freeCodeCamp'

Python Wordpress Xmlrpc 'Post_tags' not separated by comma

In my quest to learn Python, I decided to start a project that could help me in some way by automating manual tasks that I do.
I started coding a web scraper using BeautifulSoup that pulls content from another website and then posts it to a WordPress blog of my own. The data pulled includes the post title, tags, and images.
I managed to get most of it to work (my code is probably terrible), but I ran into a snag when it came to the tags.
# Getting tags
def get_tags():
    for url in modified_lines:
        SoupRequest(url)
        global tags
        global cats
        cats = []
        tags = [tag['content'] for tag in soup("meta", attrs={"property": "article:tag"})]
        categories = soup.find('span', attrs={'class': 'cat-meta-color'}).findAll('a', limit=limiter)
        allcats = [cat.text for cat in categories]
        for singlecat in allcats:
            cats.append(singlecat)
        tags.extend(cats)
        strtag = tags
        with io.open('tags.txt', 'a', encoding='utf8') as tagfile:
            posttag = "".join(str(strtag))
            tagfile.write(posttag[1:-1].replace("'", "") + '\n')

get_tags()

with open('tags.txt') as t:
    taglists = list(t.readlines())

# Posting to wordpress
def post_to_wordpress(title, allvideoLinks, alltags, images):
    filepaths = images.strip()
    filename = filepaths.split('/')[-1]
    data = {
        'name': filename,
        'type': mimetypes.MimeTypes().guess_type(filename)[0],  # mimetype
    }
    with open(filepaths, 'rb') as img:
        data['bits'] = xmlrpc_client.Binary(img.read())
    response = wp.call(media.UploadFile(data))
    attachment_id = response['id']
    wp.call(GetPosts())
    title = str(title)
    content = allvideoLinks
    post = WordPressPost()
    post.title = title
    post.thumbnail = attachment_id
    post.content = content
    post.terms_names = {
        'post_tag': [alltags],
        'category': ['Youtube'],
    }
    post.post_type = "post"
    post.post_status = 'publish'
    post.id = wp.call(NewPost(post))
    wp.call(posts.EditPost(post.id, post))

for l, y, t, f in zip(scrubbed, scrummed, taglists, images):
    post_to_wordpress(l, y, t, f)
    time.sleep(3)
While the tags do post, they come through as a single string instead of as individual list items.
I would appreciate any guidance on how to solve this issue.
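A likely cause (an assumption based on the code above, since get_tags writes each line of tags.txt as one comma-separated string): wrapping alltags in [alltags] hands WordPress a one-element list, so the whole string becomes a single tag. Splitting the string first should produce individual tags; a minimal sketch:

# alltags arrives as one line from tags.txt, e.g. "python, scraping, wordpress\n";
# split on commas so each tag becomes its own list item.
tag_list = [t.strip() for t in alltags.split(',') if t.strip()]

post.terms_names = {
    'post_tag': tag_list,
    'category': ['Youtube'],
}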
