This is code for a web crawler. I'm a beginner learning Python, so I don't know how to fix it. Something seems to be wrong with search().
# -*- coding:utf-8 -*-
import urllib, urllib2, re

class BDTB:
    def __init__(self, baseUrl, seeLZ):
        self.baseUrl = baseUrl
        self.seeLZ = '?see_lz' + str(seeLZ)

    def getPage(self, pageNum):
        try:
            url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            #print response.read().decode('utf-8')
            return response
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                print u'Failed to connect to Baidu Tieba; reason:', e.reason
                return None

    def getTitle(self):
        page = self.getPage(1)
        pattern = re.compile('<h3 class.*?px">(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            print result.group(1)
            return result.group(1).strip()
        else:
            return None

baseURL = 'http://tieba.baidu.com/p/4095047339'
bdtb = BDTB(baseURL, 1)
bdtb.getTitle()
This raises TypeError: expected string or buffer because you are passing the object returned by urllib2.urlopen(request) to re.search(), which requires a str.
If you change the return value from:
return response  # returns the response object
to one that returns the text contained in the response:
return response.read()  # returns the text of the response
your script works, and after executing it returns:
广告兼职及二手物品交易集中贴
Additionally, since you're working with Python 2.x, you might want to change your class definition from class BDTB: to class BDTB(object): in order to use new-style classes.
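That is a one-line change to the class statement:

class BDTB(object):
    # ... class body unchanged ...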
I am using the requests library (Python 3.9) to get a filename from a URL.[1] For some reason the file name is incorrectly encoded.
I should get "Ogłoszenie_0320.pdf" instead of "OgÅ\x82oszenie_0320.pdf".
My code looks something like this:
import requests
import re
import os
from urllib.parse import urlparse

def getFilenameFromRequest(url: str, headers):
    # Parse from header information
    contentDisposition = headers.get('content-disposition')
    if contentDisposition:
        filename = re.findall('filename=(.+)', contentDisposition)
        print("oooooooooo: " + contentDisposition + " : " + str(filename))
        if len(filename) != 0:
            return filename[0]
    # Parse from url
    parsedUrl = urlparse(url)
    return os.path.basename(parsedUrl.path)

def getFilenameFromUrl(url: str):
    request = requests.head(url)
    headers = request.headers
    return getFilenameFromRequest(url, headers)

getFilenameFromUrl('https://przedszkolekw.bip.gov.pl' +
                   '/fobjects/download/880287/ogloszenie-uzp-nr-613234-pdf.html')
Any idea how to fix it?
I know that for a standard request I can set the encoding directly:
request.encoding = 'utf-8'
But what am I supposed to do in this case?
[1]
https://przedszkolekw.bip.gov.pl/fobjects/download/880287/ogloszenie-uzp-nr-613234-pdf.html
Only characters from the ASCII-based latin-1 encoding should be used as header values [rfc]. Here the file name has been escaped.
>>> s = "Ogłoszenie_0320.pdf"
>>> s.encode("utf8").decode("unicode-escape")
'OgÅ\x82oszenie_0320.pdf'
To reverse the process you can do
>>> sx = 'OgÅ\x82oszenie_0320.pdf'
>>> sx.encode("latin-1").decode("utf8")
'Ogłoszenie_0320.pdf'
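Applied to the question's getFilenameFromRequest, the fix is to re-decode the captured name. A minimal sketch, assuming the server always escapes non-ASCII names this way (a plain-ASCII name passes through the round-trip unchanged):

import os
import re
from urllib.parse import urlparse

def getFilenameFromRequest(url: str, headers):
    contentDisposition = headers.get('content-disposition')
    if contentDisposition:
        filename = re.findall('filename=(.+)', contentDisposition)
        if len(filename) != 0:
            # header bytes arrive as latin-1, but the payload is really UTF-8
            return filename[0].encode('latin-1').decode('utf8')
    # fall back to the URL path, as before
    parsedUrl = urlparse(url)
    return os.path.basename(parsedUrl.path)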
(updated after conversation in comments)
I have defined a class in the controllers.py file to receive HTTP requests. The remote server sends a POST request whose body is a JSON string.
I can get the data from the request body as a dictionary via http.request.jsonrequest, but for now I need the original JSON string from the request body, not a dictionary, in order to verify a signature.
Re-serializing the dictionary with json.dumps() cannot be used, as the string obtained that way is not identical to the JSON string in the original request body, which leads to a failure when verifying the signature.
What should I do about it? Please help me. Thank you.
This is my controllers.py:
# -*- coding: utf-8 -*-
from odoo import http

class CallbackNotification(http.Controller):

    def safety_judgement(self):
        """
        :return:
        """
        header = http.request.httprequest.headers.environ
        signature = header['HTTP_X_TSIGN_OPEN_SIGNATURE']
        time_stamp = header['HTTP_X_TSIGN_OPEN_TIMESTAMP']
        remote_addr = http.request.httprequest.remote_addr
        if remote_addr != '47.99.80.224':
            return

    @http.route('/signature/process/my_odoo', type='json', auth='none')
    def receive_institution_auth(self, **kw):
        """
        :param kw:
        :return:
        """
        self.safety_judgement()
        request_body = http.request.jsonrequest
        action = request_body['action']
        flow_num = request_body['flowId']
        http_env = http.request.env
        sign_process_id = http_env['sign.process'].sudo().search([('flow_num', '=', flow_num)]).id
        if action == 'SIGN_FLOW_UPDATE':
            third_order = request_body['thirdOrderNo']
            name_id_user_list = third_order.split(',')
            model = name_id_user_list[0]
            record_id = name_id_user_list[1]
            approve_user_id = name_id_user_list[2]
            if approve_user_id != 'p':
                record_obj = http_env[model].sudo(user=int(approve_user_id)).browse(int(record_id))
            sign_result = request_body['signResult']
            result_description = request_body['resultDescription']
            account_num = request_body['accountId']
            org_or_account_num = request_body['authorizedAccountId']
            sign_user_id = http_env['sign.users'].sudo().search([('account_num','=',account_num)]).id
            http_manual_env = http_env['manual.sign'].sudo()
            if account_num == org_or_account_num:
                manual_id = http_manual_env.search([('sign_process_id','=',sign_process_id),
                                                    ('sign_user_id','=',sign_user_id)]).id
            else:
                institution_id = http_env['institution.account'].sudo().search([('org_num','=',org_or_account_num)]).id
                manual_id = http_manual_env.search([('sign_process_id', '=', sign_process_id),
                                                    ('sign_user_id', '=', sign_user_id),
                                                    ('institution_id','=',institution_id)]).id
            if sign_result == 2:
                http_manual_env.browse(manual_id).write({'sign_result':'success'})
                http.request._cr.commit()
                if approve_user_id != 'p':
                    record_obj.approve_action('approved','')
                else:
                    http_env[model].sudo().browse(int(record_id)).write({'partner_sign_state':'success'})
            elif sign_result == 3:
                http_manual_env.browse(manual_id).write({'sign_result':'failed'})
                if approve_user_id == 'p':
                    http_env[model].sudo().browse(int(record_id)).write({'partner_sign_state':'failed'})
            elif sign_result == 4:
                http_manual_env.browse(manual_id).write({'sign_result':'reject'})
                http.request._cr.commit()
                if approve_user_id != 'p':
                    record_obj.approve_action('reject', result_description)
                else:
                    http_env[model].sudo().browse(int(record_id)).write({'partner_sign_state':'reject','partner_reject_reason':result_description})
To obtain the raw body data, you can use the following code:
raw_body_data = http.request.httprequest.data
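With the raw bytes in hand, the signature can be checked against exactly what the remote server signed. A minimal sketch, assuming an HMAC-SHA256 scheme; the algorithm and the shared secret are assumptions here, since the question does not name the signing service's protocol:

import hashlib
import hmac

from odoo import http

def verify_signature(secret_key):
    # secret_key: shared secret (bytes) agreed with the signing service -- assumed
    raw_body = http.request.httprequest.data  # the untouched request bytes
    sent = http.request.httprequest.headers.get('X-Tsign-Open-Signature', '')
    computed = hmac.new(secret_key, raw_body, hashlib.sha256).hexdigest()
    return hmac.compare_digest(computed, sent)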
You can send the value from JSON-RPC into your JSON controller.
JS file:
Pass the value before calling the controller like this:
var ajax = require('web.ajax');
ajax.jsonRpc("/custom/url", 'call', {'Your Key': Your Value}).then(function (data) {
    if (data) {
        // Code
    } else {
        // Code
    }
});
Py file:
Get the data from the POST like this:
@http.route(['/custom/url'], type='json', auth="public", website=True)
def custom_controller(self, **post):
    get_data = post.get('Your Key')
    # Your customized code
Thanks
@Dipen Shah @CoolFlash95 @Charif DZ
Hello everyone, I've found a solution to this problem. But before I lay out the solution, I hope we can understand the root cause of the problem, so let's examine Odoo's source code.
from odoo.http import JsonRequest (Odoo version 10.0, line 598)
from odoo.http import JsonRequest (Odoo version 11.0, line 609)
In Odoo 10:
request = self.httprequest.stream.read(), then self.jsonrequest = json.loads(request)
In Odoo 11:
request = self.httprequest.get_data().decode(self.httprequest.charset), then self.jsonrequest = json.loads(request)
We find that the self object of JsonRequest has the attribute jsonrequest, whose type is dict. Unfortunately, the source code does not give self another attribute that contains the original string from the request body. However, it is very easy to add such an attribute, so why not?
We can use setattr to dynamically replace methods in the source code. Let's change the __init__ method of JsonRequest and add another attribute named stream_str.
E.g. Odoo version 10, Python version 2.7:
# -*- coding: utf-8 -*-
import json
import logging

import werkzeug

from odoo.http import JsonRequest

_logger = logging.getLogger(__name__)

def __init__(self, *args):
    """
    We have copied the method __init__ directly from the source code and added
    only one line of code to it
    """
    super(JsonRequest, self).__init__(*args)
    self.jsonp_handler = None
    args = self.httprequest.args
    jsonp = args.get('jsonp')
    self.jsonp = jsonp
    request = None
    request_id = args.get('id')
    if jsonp and self.httprequest.method == 'POST':
        # jsonp 2 steps step1 POST: save call
        def handler():
            self.session['jsonp_request_%s' % (request_id,)] = self.httprequest.form['r']
            self.session.modified = True
            headers = [('Content-Type', 'text/plain; charset=utf-8')]
            r = werkzeug.wrappers.Response(request_id, headers=headers)
            return r
        self.jsonp_handler = handler
        return
    elif jsonp and args.get('r'):
        # jsonp method GET
        request = args.get('r')
    elif jsonp and request_id:
        # jsonp 2 steps step2 GET: run and return result
        request = self.session.pop('jsonp_request_%s' % (request_id,), '{}')
    else:
        # regular jsonrpc2
        request = self.httprequest.stream.read()
    # We added this line of code: a new attribute named stream_str holds the
    # original string from the request body when the request type is json.
    self.stream_str = request
    # Read POST content or POST Form Data named "request"
    try:
        self.jsonrequest = json.loads(request)
    except ValueError:
        msg = 'Invalid JSON data: %r' % (request,)
        _logger.info('%s: %s', self.httprequest.path, msg)
        raise werkzeug.exceptions.BadRequest(msg)
    self.params = dict(self.jsonrequest.get("params", {}))
    self.context = self.params.pop('context', dict(self.session.context))

# Replace the __init__ method in the source code with the new __init__ method,
# without changing the source code itself
setattr(JsonRequest, '__init__', __init__)
In the definition of the routing function, we can do this:
# -*- coding: utf-8 -*-
from odoo.http import Controller, route, request

class CallbackNotification(Controller):

    @route('/signature/process/my_odoo', type='json', auth='none')
    def receive_institution_auth(self, **kw):
        # When type='json', the request is a JsonRequest object, so we can
        # read the new attribute stream_str very easily!
        stream_str = request.stream_str
Now the problem has been solved.
I am working on a stock prediction project. I want to download historical data from Yahoo Finance and save it in CSV format.
Since I am a beginner in Python I am unable to correct the error.
My code is as follows:
import re
import urllib2
import calendar
import datetime
import getopt
import sys
import time

crumble_link = 'https://finance.yahoo.com/quote/{0}/history?p={0}'
crumble_regex = r'CrumbStore":{"crumb":"(.*?)"}'
cookie_regex = r'Set-Cookie: (.*?); '
quote_link = 'https://query1.finance.yahoo.com/v7/finance/download/{}?period1={}&period2={}&interval=1d&events=history&crumb={}'

def get_crumble_and_cookie(symbol):
    link = crumble_link.format(symbol)
    response = urllib2.urlopen(link)
    match = re.search(cookie_regex, str(response.info()))
    cookie_str = match.group(1)
    text = response.read()
    match = re.search(crumble_regex, text)
    crumble_str = match.group(1)
    return crumble_str, cookie_str

def download_quote(symbol, date_from, date_to):
    time_stamp_from = calendar.timegm(datetime.datetime.strptime(date_from, "%Y-%m-%d").timetuple())
    time_stamp_to = calendar.timegm(datetime.datetime.strptime(date_to, "%Y-%m-%d").timetuple())
    attempts = 0
    while attempts < 5:
        crumble_str, cookie_str = get_crumble_and_cookie(symbol)
        link = quote_link.format(symbol, time_stamp_from, time_stamp_to, crumble_str)
        #print link
        r = urllib2.Request(link, headers={'Cookie': cookie_str})
        try:
            response = urllib2.urlopen(r)
            text = response.read()
            print "{} downloaded".format(symbol)
            return text
        except urllib2.URLError:
            print "{} failed at attempt # {}".format(symbol, attempts)
            attempts += 1
            time.sleep(2*attempts)
    return ""

if __name__ == '__main__':
    print get_crumble_and_cookie('KO')
    from_arg = "from"
    to_arg = "to"
    symbol_arg = "symbol"
    output_arg = "o"
    opt_list = (from_arg+"=", to_arg+"=", symbol_arg+"=")
    try:
        options, args = getopt.getopt(sys.argv[1:], output_arg+":", opt_list)
    except getopt.GetoptError as err:
        print err
    for opt, value in options:
        if opt[2:] == from_arg:
            from_val = value
        elif opt[2:] == to_arg:
            to_val = value
        elif opt[2:] == symbol_arg:
            symbol_val = value
        elif opt[1:] == output_arg:
            output_val = value
    print "downloading {}".format(symbol_val)
    text = download_quote(symbol_val, from_val, to_val)
    with open(output_val, 'wb') as f:
        f.write(text)
    print "{} written to {}".format(symbol_val, output_val)
And the error message that I am getting is:
File "C:/Users/Murali/PycharmProjects/generate/venv/tcl/generate2.py", line
49, in <module>
print get_crumble_and_cookie('KO')
File "C:/Users/Murali/PycharmProjects/generate/venv/tcl/generate2.py", line
19, in get_crumble_and_cookie
cookie_str = match.group(1)
AttributeError: 'NoneType' object has no attribute 'group'
So how can we resolve this problem that has popped up?
Look at these two commands:
match = re.search(cookie_regex, str(response.info()))
cookie_str = match.group(1)
The first one takes the string form of response.info() and does a regular-expression search for cookie_regex. Then match.group(1) is supposed to extract the first capture group from that match. The problem, however, is that if you add a print match between these two commands, you'll see that re.search() returned None. This means match.group() has nothing to "group", which is why it errors out.
If you take a closer look at response.info() (you could just add a print response.info() command to your script to see it), you'll see that there's a line in the response that starts with "set-cookie:", whose value you're trying to capture. However, your cookie_regex string is set to look for a line with "Set-Cookie:". Note the capital letters. When I change that string to all lower-case, the error goes away:
cookie_regex = r'set-cookie: (.*?); '
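Alternatively, you can keep the original pattern and pass re.IGNORECASE, so the search matches whatever casing the server sends:

match = re.search(cookie_regex, str(response.info()), re.IGNORECASE)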
I did run into another error after that, where print "downloading {}".format(symbol_val) fails because symbol_val hasn't been defined. That variable is only assigned when opt[2:] == symbol_arg, so you may want to rewrite that part to cover all cases, as in the sketch below.
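A minimal sketch of such a guard, reusing the script's own variable names (the usage text is just an illustration):

from_val = to_val = symbol_val = output_val = None
for opt, value in options:
    if opt[2:] == from_arg:
        from_val = value
    elif opt[2:] == to_arg:
        to_val = value
    elif opt[2:] == symbol_arg:
        symbol_val = value
    elif opt[1:] == output_arg:
        output_val = value
if None in (from_val, to_val, symbol_val, output_val):
    print "usage: generate2.py --from=YYYY-MM-DD --to=YYYY-MM-DD --symbol=SYMBOL -o OUTPUT"
    sys.exit(1)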
I am trying to pass the method attribute local_filename of get_img_url to the rest of ImgurDownload, but I am getting the error
"AttributeError: ImgurDownload instance has no attribute 'local_filename'"
Is it possible to make local_filename available outside get_img_url so other methods can access it?
import re, os, glob, sys
import requests
from bs4 import BeautifulSoup
import pdb
import pprint

# imgur url pattern
imgurUrlPattern = re.compile(r'(http://i.imgur.com/(.*))(\?.*)?')

class ImgurDownload():

    #local_filename = None

    def __init__(self, link_url, target_subreddit, submissionid):
        self.link_url = link_url
        self.target_subreddit = target_subreddit
        self.submissionid = submissionid

    def download_image(self):
        response = requests.get("{}".format(self.link_url))
        if response.status_code == 200:
            #pdb.set_trace()
            # -------------> 2. local_filename is what i want to get from get_img_url <------------
            print('Downloading %s...' % self.local_filename)
            with open(self.local_filename, 'wb') as fo:
                for chunk in response.iter_content(4096):
                    fo.write(chunk)

    def get_img_url(self):
        if "imgur.com/" not in self.link_url:
            pass  # skip non-imgur submissions
        if len(glob.glob('reddit_%s_%s_*' % (self.target_subreddit, self.submissionid))) > 0:
            pass  # we've already downloaded files for this reddit submission
        if 'http://i.imgur.com/' in self.link_url:
            # The URL is a direct link to the image.
            mo = imgurUrlPattern.search(self.link_url)  # using regex here instead of BeautifulSoup because we are parsing a url, not html
            imgurFilename = mo.group(2)
            if '?' in imgurFilename:
                # The regex doesn't catch a "?" at the end of the filename, so we remove it here.
                self.imgurFilename = imgurFilename[:imgurFilename.find('?')]
            # --------------------> 1. I want this instance var to be passed on to ^ download_image method <-----------------------
            self.local_filename = 'reddit_%s_%s_album_None_imgur_%s' % (self.target_subreddit, self.submissionid, imgurFilename)
            self.download_image()

    def print_self_dict(self):
        pprint.pprint(self.__dict__)

sample = "http://i.imgur.com/yhemck6.jpg"
x = ImgurDownload('http://i.imgur.com/C5uIWlD.jpg','brogress','2cgwwn')
print x.download_image()
#x.print_self_dict()
self.local_filename is only defined if a) you call get_img_url first and b) the link contains imgur.com. In your example, you haven't done a).
When self.local_filename is accessed in the download_image() method, Python looks for a local_filename instance attribute. Because this attribute was never set before download_image() accessed it, Python throws an AttributeError.
We have to ensure the attribute is initialized before accessing it. The best approach is to initialize it inside __init__(). Then, by the time download_image() runs, self already carries a local_filename attribute.
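A minimal sketch of that fix; the early return in download_image() is one possible guard, not the only one:

class ImgurDownload(object):
    def __init__(self, link_url, target_subreddit, submissionid):
        self.link_url = link_url
        self.target_subreddit = target_subreddit
        self.submissionid = submissionid
        self.local_filename = None  # filled in later by get_img_url()

    def download_image(self):
        if self.local_filename is None:
            # get_img_url() hasn't produced a filename yet; nothing to save to
            return
        # ... rest of download_image() unchanged ...

With that guard, calling download_image() directly is a no-op instead of a crash, while calling get_img_url() first still triggers the download as before.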
This is my code. It contains no recursion, but it hits the maximum recursion depth on the first pickle...
Code:
#!/usr/bin/env python
from bs4 import BeautifulSoup
from urllib2 import urlopen
import pickle

# open page and return soup list
def get_page_startups(page_url):
    html = urlopen(page_url).read()
    soup = BeautifulSoup(html, "lxml")
    return soup.find_all("div","startup item")

#
# Get certain text from startup soup
#
def get_name(startup):
    return startup.find("a", "profile").string

def get_website(startup):
    return startup.find("a", "visit")["href"]

def get_status(startup):
    return startup.find("p","status").strong.string[8:]

def get_twitter(startup):
    return startup.find("a", "comment").string

def get_high_concept_pitch(startup):
    return startup.find("div","headline").find_all("em")[1].string

def get_elevator_pitch(startup):
    startup_soup = BeautifulSoup(urlopen("http://startupli.st" + startup.find("a","profile")["href"]).read(),"lxml")
    return startup_soup.find("p", "desc").string.rstrip().lstrip()

def get_tags(startup):
    return startup.find("p","tags").string

def get_blog(startup):
    try:
        return startup.find("a","visit blog")["href"]
    except TypeError:
        return None

def get_facebook(startup):
    try:
        return startup.find("a","visit facebook")["href"]
    except TypeError:
        return None

def get_angellist(startup):
    try:
        return startup.find("a","visit angellist")["href"]
    except TypeError:
        return None

def get_linkedin(startup):
    try:
        return startup.find("a","visit linkedin")["href"]
    except TypeError:
        return None

def get_crunchbase(startup):
    try:
        return startup.find("a","visit crunchbase")["href"]
    except TypeError:
        return None

# site to scrape
BASE_URL = "http://startupli.st/startups/latest/"

# scrape all pages
for page_no in xrange(1,142):
    startups = get_page_startups(BASE_URL + str(page_no))
    # search soup and pickle data
    for i, startup in enumerate(startups):
        s = {}
        s['name'] = get_name(startup)
        s['website'] = get_website(startup)
        s['status'] = get_status(startup)
        s['high_concept_pitch'] = get_high_concept_pitch(startup)
        s['elevator_pitch'] = get_elevator_pitch(startup)
        s['tags'] = get_tags(startup)
        s['twitter'] = get_twitter(startup)
        s['facebook'] = get_facebook(startup)
        s['blog'] = get_blog(startup)
        s['angellist'] = get_angellist(startup)
        s['linkedin'] = get_linkedin(startup)
        s['crunchbase'] = get_crunchbase(startup)
        f = open(str(i)+".pkl", "wb")
        pickle.dump(s,f)
        f.close()
    print "Done " + str(page_no)
This is the content of 0.pkl after the exception is raised:
http://pastebin.com/DVS1GKzz (thousands of lines long!)
There's some HTML from the BASE_URL in the pickle... but I didn't pickle any html strings...
BeautifulSoup .string attributes aren't actually strings:
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup('<div>Foo</div>')
>>> soup.find('div').string
u'Foo'
>>> type(soup.find('div').string)
<class 'bs4.element.NavigableString'>
Try using str(soup.find('div').string) instead and see if it helps. Also, I don't think pickle is really the best format here; JSON is much easier for this kind of data.
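A sketch of both suggestions combined, in the question's Python 2 style (unicode() is the safer coercion if a field can contain non-ASCII text, and fields that can be None may need a guard):

import json

s = {}
s['name'] = unicode(get_name(startup))      # plain text, not a NavigableString
s['website'] = unicode(get_website(startup))
# ... coerce the remaining fields the same way ...
with open(str(i) + ".json", "w") as f:
    json.dump(s, f)  # JSON stores plain data, so there is no node graph to recurse into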
Most likely pickle is recursing internally, and the object you are trying to pickle is too deeply nested. You could try increasing the limit on the number of recursions allowed.
import sys
sys.setrecursionlimit(10000)
This is, however, not recommended for any production-ready application, as it may mask the actual issue, but it can help to highlight issues during debugging.
Pickle cannot handle BeautifulSoup nodes. Similar questions with some workarounds:
RuntimeError: maximum recursion depth exceeded with Python 3.2 pickle.dump
pickle.dump meet RuntimeError: maximum recursion depth exceeded in cmp