I have written code for calling the AlchemyLanguage API of Bluemix in Python. I need the keywords and entities, but it is only showing the first keyword and first entity for the text file. Where am I going wrong?
import requests
import urllib
import urllib2
def call_alchemy_api(text, API_KEY, max_retrieve='1'):
    """POST ``text`` to the AlchemyLanguage ``TextGetCombinedData`` endpoint.

    Args:
        text: Raw text to analyse. It is form-encoded exactly once by
            ``urllib.urlencode``, so it must not be quoted beforehand.
        API_KEY: AlchemyAPI key, sent as the ``apikey`` form field.
        max_retrieve: Value sent as the ``maxRetrieve`` form field.
            Defaults to ``'1'``.

    Returns:
        The open response object returned by ``urllib2.urlopen``; the caller
        reads the JSON body with ``.read()``.
    """
    payload = {
        'outputMode': 'json',
        'extract': 'entities,keywords',
        'sentiment': '1',
        'maxRetrieve': str(max_retrieve),
        'url': 'https://www.ibm.com/us-en/',
        'apikey': API_KEY,
        'text': text,
    }
    # urlencode() percent-encodes every value itself; quoting the text
    # beforehand would double-encode it.
    data = urllib.urlencode(payload)
    url = 'https://gateway-a.watsonplatform.net/calls/text/TextGetCombinedData'
    # Supplying a data argument makes urllib2 issue a POST request.
    req = urllib2.Request(url, data)
    return urllib2.urlopen(req)
if __name__ == "__main__":
    api_key = 'xxxxxxxxxxxxxxxxxxxxxmyapi'
    # Read the whole input document; the context manager closes the file.
    with open('in0.txt', 'r') as f:
        text = f.read()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(text)
    response = call_alchemy_api(text, api_key)
    print(response.read())
Your request sets maxRetrieve to '1', so only one keyword and one entity are returned. Increase the value of maxRetrieve.
Example:
# Same request fields, with maxRetrieve set to '3' instead of '1'.
payload = {
    'outputMode': 'json',
    'extract': 'entities,keywords',
    'sentiment': '1',
    'maxRetrieve': '3',
    'url': 'https://www.ibm.com/us-en/',
}
API Link:
http://www.ibm.com/watson/developercloud/alchemy-language/api/v1/
Related
When I search for books with a single-word name (e.g. bluets) my code works fine, but when I search for books whose names have two words or contain spaces (e.g. white whale) I get an error (Jinja2 syntax). How do I solve this error?
@app.route("/book", methods=["GET", "POST"])
def get_books():
    """Look up the submitted book title in the Google Books API and render the hits.

    Only POST requests are handled; for any other method the function falls
    through and returns ``None``.
    """
    api_key = os.environ.get("API_KEY")
    if request.method == "POST":
        book = request.form.get("book")
        # NOTE(review): ``book`` is interpolated into the query string as-is,
        # so a title containing a space yields a URL with a raw space in it.
        url = f"https://www.googleapis.com/books/v1/volumes?q={book}:keyes&key={api_key}"
        response = urllib.request.urlopen(url)
        data = response.read()
        jsondata = json.loads(data)
        return render_template("book.html", books=jsondata["items"])
I tried to search for similar cases, and just found one solution, but I didn't understand it
Here is my error message
http.client.InvalidURL
http.client.InvalidURL: URL can't contain control characters. '/books/v1/volumes?q=white whale:keyes&key=AIzaSyDtjvhKOniHFwkIcz7-720bgtnubagFxS8' (found at least ' ')
Some chars in url need to be encoded - in your situation you have to use + or %20 instead of space.
This url has %20 instead of space and it works for me. If I use + then it also works
import json
import urllib.request

# The space in the query is written as %20, so the URL is valid as it stands.
url = 'https://www.googleapis.com/books/v1/volumes?q=white%20whale:keyes&key=AIzaSyDtjvhKOniHFwkIcz7-720bgtnubagFxS8'
# Alternative with the space written as '+':
#url = 'https://www.googleapis.com/books/v1/volumes?q=white+whale:keyes&key=AIzaSyDtjvhKOniHFwkIcz7-720bgtnubagFxS8'

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    text = response.read()

data = json.loads(text)
print(data)
With requests you don't even have to do it manually because it does it automatically
import requests

# The URL is passed with a literal space; no manual encoding is done here.
url = 'https://www.googleapis.com/books/v1/volumes?q=white whale:keyes&key=AIzaSyDtjvhKOniHFwkIcz7-720bgtnubagFxS8'

# Fetch the page and decode the JSON body in one expression.
data = requests.get(url).json()
print(data)
You may use urllib.parse.urlencode() to make sure all chars are correctly encoded.
import json
import urllib.parse
import urllib.request

payload = {
    'q': 'white whale:keyes',
    'key': 'AIzaSyDtjvhKOniHFwkIcz7-720bgtnubagFxS8',
}

# urlencode() builds the query string and escapes characters such as the
# space in the 'q' value.
query = urllib.parse.urlencode(payload)
url = 'https://www.googleapis.com/books/v1/volumes?' + query

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    text = response.read()

data = json.loads(text)
print(data)
and the same with requests - it also doesn't need encoding
import requests

url = 'https://www.googleapis.com/books/v1/volumes'

# Query values are handed over unencoded through params=.
payload = {
    'q': 'white whale:keyes',
    'key': 'AIzaSyDtjvhKOniHFwkIcz7-720bgtnubagFxS8',
}

reply = requests.get(url, params=payload)
print(reply.json())
I have been trying to extract specific pages from each PDF and then merge all the extracted pages into one PDF.
I have a list of PDFs.
I am using the pdfrw library, but I get an error while extracting the pages.
from pdfrw import PdfReader, PdfWriter
import os

# Every regular file in the current directory whose name ends in '.pdf'.
files = [f for f in os.listdir('.') if os.path.isfile(f) and f.endswith('.pdf')]
print(files)

# (start, stop) pairs fed to range(); (6, 7) therefore selects only page 6.
parts = [(6, 7)]

for pdf in files:
    pages = PdfReader(pdf).pages
    # Drop only the extension. str.title() would change the capitalisation of
    # the name, and split('.')[0] would cut names that contain extra dots.
    title = os.path.splitext(pdf)[0]
    for part in parts:
        outdata = PdfWriter(f'{title}_{part[0]}_.pdf')
        for pagenum in range(*part):
            # pagenum is treated as 1-based; the pages list is indexed from 0.
            outdata.addpage(pages[pagenum - 1])
        outdata.write()
Please help if possible
raise PdfParseError('Invalid PDF header: %s' %
pdfrw.errors.PdfParseError: Invalid PDF header: '<!doctype html>'
Manas,
One way to achieve your requirement is to use a web API. For example, the following code snippet uploads a PDF file and then splits it.
import os
import requests # pip install requests
# The authentication key (API Key).
# Get your own by registering at https://app.pdf.co
API_KEY = "*********************************"
# Base URL for PDF.co Web API requests
BASE_URL = "https://api.pdf.co/v1"
# Source PDF file
SourceFile = ".\\sample.pdf"
# Comma-separated list of page numbers (or ranges) to process. Example: '1,3-5,7-'.
Pages = "1-2,3-"
def main(args = None):
    """Upload SourceFile and, if the upload succeeded, split it."""
    uploadedFileUrl = uploadFile(SourceFile)
    # uploadFile() returns None on failure; compare by identity.
    if uploadedFileUrl is not None:
        splitPDF(uploadedFileUrl)
def splitPDF(uploadedFileUrl):
    """Split PDF using PDF.co Web API and download every resulting part.

    Args:
        uploadedFileUrl: URL of the already-uploaded source PDF.

    Each URL in the "urls" list of the API response is saved in the current
    directory as Page1.pdf, Page2.pdf, ... Failures are printed, not raised.
    """
    # Prepare request fields (sent as form data)
    # See documentation: https://apidocs.pdf.co
    parameters = {"pages": Pages, "url": uploadedFileUrl}

    # Prepare URL for 'Split PDF' API request
    url = "{}/pdf/split".format(BASE_URL)

    # Execute request and get response as JSON
    response = requests.post(url, data=parameters, headers={ "x-api-key": API_KEY })
    if response.status_code != 200:
        print(f"Request error: {response.status_code} {response.reason}")
        return

    result = response.json()
    if result["error"]:
        # Show service reported error
        print(result["message"])
        return

    # Download the generated PDF parts, numbering the output files from 1
    for part, resultFileUrl in enumerate(result["urls"], start=1):
        r = requests.get(resultFileUrl, stream=True)
        localFileUrl = f"Page{part}.pdf"
        if r.status_code == 200:
            with open(localFileUrl, 'wb') as file:
                for chunk in r:
                    file.write(chunk)
            print(f"Result file saved as \"{localFileUrl}\" file.")
        else:
            # Report the failed download itself, not the earlier split request
            print(f"Request error: {r.status_code} {r.reason}")
def uploadFile(fileName):
    """Uploads file to the cloud.

    Args:
        fileName: Path of the local file to upload.

    Returns:
        The URL under which the uploaded file can be referenced in later API
        calls, or None if any step failed (the reason is printed).
    """
    # 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.
    # Prepare URL for 'Get Presigned URL' API request
    url = "{}/file/upload/get-presigned-url".format(BASE_URL)
    # Passing the query through params= lets requests percent-encode it, so a
    # file name containing spaces or '&' cannot corrupt the URL.
    query = {
        "contenttype": "application/octet-stream",
        "name": os.path.basename(fileName),
    }

    # Execute request and get response as JSON
    response = requests.get(url, params=query, headers={ "x-api-key": API_KEY })
    if response.status_code != 200:
        print(f"Request error: {response.status_code} {response.reason}")
        return None

    result = response.json()
    if result["error"]:
        # Show service reported error
        print(result["message"])
        return None

    # URL to use for file upload
    uploadUrl = result["presignedUrl"]
    # URL for future reference
    uploadedFileUrl = result["url"]

    # 2. UPLOAD FILE TO CLOUD.
    with open(fileName, 'rb') as file:
        upload = requests.put(uploadUrl, data=file, headers={ "x-api-key": API_KEY, "content-type": "application/octet-stream" })
    if not upload.ok:
        # Do not hand back a URL for a file that never reached the server
        print(f"Request error: {upload.status_code} {upload.reason}")
        return None

    return uploadedFileUrl
if __name__ == '__main__':
main()
Now, to merge the PDF files, you can use code similar to the following snippet.
import os
import requests # pip install requests
# The authentication key (API Key).
# Get your own by registering at https://app.pdf.co
API_KEY = "**********************************"
# Base URL for PDF.co Web API requests
BASE_URL = "https://api.pdf.co/v1"
# Source PDF files
SourceFile_1 = ".\\sample1.pdf"
SourceFile_2 = ".\\sample2.pdf"
# Destination PDF file name
DestinationFile = ".\\result.pdf"
def main(args = None):
    """Upload both source PDFs and, if both uploads succeeded, merge them."""
    UploadedFileUrl_1 = uploadFile(SourceFile_1)
    UploadedFileUrl_2 = uploadFile(SourceFile_2)

    # uploadFile() returns None on failure; compare by identity.
    if UploadedFileUrl_1 is not None and UploadedFileUrl_2 is not None:
        # The merge request takes the source URLs as one comma-separated string
        uploadedFileUrls = "{},{}".format(UploadedFileUrl_1, UploadedFileUrl_2)
        mergeFiles(uploadedFileUrls, DestinationFile)
def mergeFiles(uploadedFileUrls, destinationFile):
    """Perform Merge using PDF.co Web API and download the merged PDF.

    Args:
        uploadedFileUrls: Comma-separated URLs of the uploaded source PDFs.
        destinationFile: Local path the merged PDF is written to.

    Failures are printed, not raised.
    """
    # Prepare request fields (sent as form data)
    # See documentation: https://apidocs.pdf.co
    parameters = {
        "name": os.path.basename(destinationFile),
        "url": uploadedFileUrls,
    }

    # Prepare URL for 'Merge PDF' API request
    url = "{}/pdf/merge".format(BASE_URL)

    # Execute request and get response as JSON
    response = requests.post(url, data=parameters, headers={ "x-api-key": API_KEY })
    if response.status_code != 200:
        print(f"Request error: {response.status_code} {response.reason}")
        return

    result = response.json()
    if result["error"]:
        # Show service reported error
        print(result["message"])
        return

    # Get URL of result file
    resultFileUrl = result["url"]

    # Download result file
    r = requests.get(resultFileUrl, stream=True)
    if r.status_code != 200:
        # Report the failed download itself, not the earlier merge request
        print(f"Request error: {r.status_code} {r.reason}")
        return

    with open(destinationFile, 'wb') as file:
        for chunk in r:
            file.write(chunk)
    print(f"Result file saved as \"{destinationFile}\" file.")
def uploadFile(fileName):
    """Uploads file to the cloud.

    Args:
        fileName: Path of the local file to upload.

    Returns:
        The URL under which the uploaded file can be referenced in later API
        calls, or None if any step failed (the reason is printed).
    """
    # 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.
    # Prepare URL for 'Get Presigned URL' API request
    url = "{}/file/upload/get-presigned-url".format(BASE_URL)
    # Passing the query through params= lets requests percent-encode it, so a
    # file name containing spaces or '&' cannot corrupt the URL.
    query = {
        "contenttype": "application/octet-stream",
        "name": os.path.basename(fileName),
    }

    # Execute request and get response as JSON
    response = requests.get(url, params=query, headers={ "x-api-key": API_KEY })
    if response.status_code != 200:
        print(f"Request error: {response.status_code} {response.reason}")
        return None

    result = response.json()
    if result["error"]:
        # Show service reported error
        print(result["message"])
        return None

    # URL to use for file upload
    uploadUrl = result["presignedUrl"]
    # URL for future reference
    uploadedFileUrl = result["url"]

    # 2. UPLOAD FILE TO CLOUD.
    with open(fileName, 'rb') as file:
        upload = requests.put(uploadUrl, data=file, headers={ "x-api-key": API_KEY, "content-type": "application/octet-stream" })
    if not upload.ok:
        # Do not hand back a URL for a file that never reached the server
        print(f"Request error: {upload.status_code} {upload.reason}")
        return None

    return uploadedFileUrl
if __name__ == '__main__':
main()
In this sample I am using the PDF.co API. Refer to the following links for more information.
https://apidocs.pdf.co/30-pdf-split, https://apidocs.pdf.co/31-pdf-merge
Thanks!
import requests
import json
import jsonpath
def test_add_new_data():
    """POST the technical-skills and address fixtures, then fetch the final record.

    Only the technical-skills request is asserted; the address POST and the
    final GET just print what the server returned.
    """
    # url = "http://thetestingworldapi.com/api/studentsDetails"
    # f = open('/Users/sunghunkwak/PycharmProjects/apiTesting/postStudent.json', 'r')
    # requests_post_id = json.loads(f.read())
    # result = requests.post(url, requests_post_id)
    # assert result.status_code == 201
    # id = jsonpath.jsonpath(result.json(), 'id')
    # print(id[0])

    url_tech = "http://thetestingworldapi.com/api/technicalskills"
    # The context manager closes the fixture file after it has been parsed.
    with open('/Users/sunghunkwak/PycharmProjects/apiTesting/postTechskills.json', 'r') as f:
        requests_post_tech = json.load(f)
    # The dict is passed as the second positional argument, i.e. as data=.
    result = requests.post(url_tech, requests_post_tech)
    assert result.status_code == 200
    print(result.text)

    url_addr = "http://thetestingworldapi.com/api/addresses"
    with open('/Users/sunghunkwak/PycharmProjects/apiTesting/postAddress.json', 'r') as f:
        requests_post_addr = json.load(f)
    result = requests.post(url_addr, requests_post_addr)
    print(result.status_code)
    # assert result.status_code == 200

    url_final = "http://thetestingworldapi.com/api/FinalStudentDetails/189969"
    requests_get = requests.get(url_final)
    print(requests_get.text)
I tried to run the test, but it keeps returning errors like the ones below.
test.py {"status":"true","msg":"Add data success"}
500
{"Message":"An error has occurred."}
The address POST and the final-data GET are both returning errors.
How can I solve this?
Thank you.
Are you sure your JSON files are correct that you're trying to post? Here is an example post request:
import requests

url = 'https://reqres.in/api/users'

# Fields to post; they are passed through data=.
payload = {"name": "foo", "job": "bar"}

reply = requests.post(url, data=payload)
print(reply.text)
When I changed the URL to "http://thetestingworldapi.com/api/addresses", it also returned the same error: {"Message":"An error has occurred."}
Here is another example, in which I read the payload from a JSON file and post it:
import requests
import os
import json

url = 'https://reqres.in/api/users'
filename = "foo.json"

# Resolve the file relative to this script rather than the working directory.
folder_ = os.path.dirname(os.path.abspath(__file__))
absolute_filename = os.path.join(folder_, filename)

# The context manager closes the file after the payload has been parsed.
with open(absolute_filename, 'r') as f:
    payload = json.load(f)

x = requests.post(url, payload)
print(x.text)
It looks like the error is on the server side.
Also I suggest you read HTTP response status codes:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
I've tried two completely different methods, but I still can't get the data that is only present after logging in.
I tried one using requests, but the XPath returns null.
import requests
from lxml import html
USERNAME = "xxx"
PASSWORD = "xxx"
LOGIN_URL = "http://www.reginaandrew.com/customer/account/loginPost/referer/aHR0cDovL3d3dy5yZWdpbmFhbmRyZXcuY29tLz9fX19TSUQ9VQ,,/"
URL = "http://www.reginaandrew.com/gold-leaf-glass-top-table"
def main():
    """Log in with a requests session, then print the in-stock text of URL."""
    session_requests = requests.session()

    # Get login csrf token
    result = session_requests.get(LOGIN_URL)
    tree = html.fromstring(result.text)

    # xpath() returns a list of attribute values. Send the first one as plain
    # text rather than the list's repr ("['...']").
    # NOTE(review): assumes the first <input> directly under #login-form is
    # the form_key field — confirm against the page markup.
    form_keys = tree.xpath("//*[@id='login-form']/input/@value")
    FormKeyTxt = form_keys[0] if form_keys else ""

    # Create payload
    payload = {
        "login[username]": USERNAME,
        "login[password]": PASSWORD,
        "form_key": FormKeyTxt,
        "persistent_remember_me": "checked"
    }

    # Perform login
    result = session_requests.post(LOGIN_URL, data=payload)

    # Scrape url. The same session object is reused, so the credentials are
    # not sent again as a body on this GET.
    result = session_requests.get(URL)
    tree = html.fromstring(result.content)
    bucket_names = tree.xpath("//span[contains(@class, 'in-stock')]/text()")
    print(bucket_names)
    print(result)
    print(result.status_code)
if __name__ == '__main__':
main()
I've tried another one using MechanicalSoup, but it still returns null.
import argparse
import mechanicalsoup
import urllib.request
from bs4 import BeautifulSoup

# Credentials are taken from the command line as two positional arguments.
parser = argparse.ArgumentParser(description='Login to GitHub.')
parser.add_argument("username")
parser.add_argument("password")
args = parser.parse_args()

browser = mechanicalsoup.Browser()

# Fetch the product page and pick the element with id "login-form" from it.
login_page = browser.get("http://www.reginaandrew.com/gold-leaf-glass-top-table")
login_form = login_page.soup.select("#login-form")[0]
# NOTE(review): select() returns a plain parsed tag; confirm that calling
# .input({...}) on it actually fills in the fields (MechanicalSoup's Form
# wrapper may be required) — TODO verify against the installed version.
login_form.input({"login[username]": args.username, "login[password]": args.password})

# Submit the form, using the fetched page's URL as the second argument.
page2 = browser.submit(login_form,login_page.url )

# Print the first element with class "in-stock1", if any, then the page title.
messages = page2.soup.find(class_='in-stock1')
if messages:
    print(messages.text)
print(page2.soup.title.text)
I understand the top solution better, so I'd like to do it using that, but is there anything I'm missing? (I'm sure I'm missing a lot.)
This should do it
import requests
import re

url = "http://www.reginaandrew.com/"

# One session for both requests.
session = requests.session()
home = session.get(url)

# Isolate the login <form> markup, then read its action URL and form_key.
form_match = re.search(r'<form.+?id="login-form".+?<\/form>', home.text, re.S|re.I)
form_html = form_match.group()
action = re.search(r'action="(.+?)"', form_html).group(1)
form_key = re.search(r'name="form_key".+?value="(.+?)"', form_html).group(1)

payload = {
    "login[username]": "fugees",
    "login[password]": "nugees",
    "form_key": form_key,
    "persistent_remember_me": "on"
}

# Post the credentials to the form's own action URL, with the home page as Referer.
login_reply = session.post(action, data=payload, headers={'Referer':url})
I am requesting an Ajax Web site with a Python script and fetching cities and branch offices of http://www.yurticikargo.com/bilgi-servisleri/Sayfalar/en-yakin-sube.aspx
I completed the first step by posting
{cityID: 34} to this URL and fetching the JSON output.
http://www.yurticikargo.com/_layouts/ArikanliHolding.YurticiKargo.WebSite/ajaxproxy-sswservices.aspx/GetTownByCity
But I cannot retrieve the JSON output with Python, although I get it successfully with the Chrome Advanced Rest Client extension by posting {cityID:54,townID:5416,unitOnDutyFlag:null,closestFlag:2} to this URL:
http://www.yurticikargo.com/_layouts/ArikanliHolding.YurticiKargo.WebSite/ajaxproxy-unitservices.aspx/GetUnit
All of the source code is here
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import json
class Yurtici(object):
    """Small client for the yurticikargo.com branch-locator AJAX endpoints."""

    baseUrl = 'http://www.yurticikargo.com/'
    ajaxRoot = '_layouts/ArikanliHolding.YurticiKargo.WebSite/ajaxproxy-sswservices.aspx/'
    getTown = 'GetTownByCity'
    getUnit = 'GetUnit'
    urlGetTown = baseUrl + ajaxRoot + getTown
    # NOTE(review): built from the same ajaxRoot (ajaxproxy-sswservices.aspx)
    # as urlGetTown — confirm that GetUnit is really served from that page.
    urlGetUnit = baseUrl + ajaxRoot + getUnit
    headers = {'content-type': 'application/json','encoding':'utf-8'}

    def __init__(self):
        pass

    def ilceler(self, plaka=34): # Default testing value
        """POST ``plaka`` as ``cityId`` to GetTownByCity and return the parsed JSON."""
        payload = {'cityId':plaka}
        url = self.urlGetTown
        r = requests.post(url, data=json.dumps(payload), headers=self.headers)
        return r.json() # OK

    def subeler(self, ilceNo=5902): # Default testing value
        """POST a GetUnit query and print the status code and response body.

        NOTE(review): ``ilceNo`` is accepted but not used; the query is
        hard-coded to cityID 59 / townID 5902.
        """
        # 5902 Çerkezköy
        # None is serialised by json.dumps() as JSON null, whereas the string
        # 'null' would be sent as the text "null".
        payload= {'cityID':59,'townID':5902,'unitOnDutyFlag':None,'closestFlag':0}
        url = self.urlGetUnit
        headers = {'content-type': 'application/json','encoding':'utf-8'}
        r = requests.post(url, data=json.dumps(payload), headers=headers)
        # r.text holds the decoded body. The raw stream has already been
        # consumed when post() returns, so r.raw.read() would yield nothing.
        # The %-formatted single argument prints the same on Python 2 and 3.
        print("%s %s" % (r.status_code, r.text))
if __name__ == '__main__':
    client = Yurtici()
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(client.ilceler(37)) # OK
    print(client.subeler()) # NOT OK !!!
Your code isn't posting to the same url you're using in your text example.
Let's walk through this backwards. First, let's look at the failing POST.
url = self.urlGetUnit
headers = {'content-type': 'application/json','encoding':'utf-8'}
r = requests.post(url, data=json.dumps(payload), headers=headers)
So we're posting to a URL that is equal to self.urlGetUnit. Ok, let's look at how that's defined:
baseUrl = 'http://www.yurticikargo.com/'
ajaxRoot = '_layouts/ArikanliHolding.YurticiKargo.WebSite/ajaxproxy-sswservices.aspx/'
getUnit = 'GetUnit'
urlGetUnit = baseUrl + ajaxRoot + getUnit
If you do the work in urlGetUnit, you get that the URL will be http://www.yurticikargo.com/_layouts/ArikanliHolding.YurticiKargo.WebSite/ajaxproxy-sswservices.aspx/GetUnit. Let's put this alongside the URL you used in Chrome to compare the differences:
http://www.yurticikargo.com/_layouts/ArikanliHolding.YurticiKargo.WebSite/ajaxproxy-sswservices.aspx/GetUnit
http://www.yurticikargo.com/_layouts/ArikanliHolding.YurticiKargo.WebSite/ajaxproxy-unitservices.aspx/GetUnit
See the difference? ajaxRoot is not the same for both URLs. Sort that out and you'll get back a JSON response.