Azure Cognitive Services: Problem with Text Analytics PII Endpoint in Python SDK - python

I'm trying to perform some more in-depth PII detection as the standard code that might be found here: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/quickstart?pivots=programming-language-python fails to find some more detailed entities (like French registration plates number, for example).
Everything works fine when I use the standard endpoint: 'https://whatever.cognitiveservices.azure.com/'
However, when I switch to 'https://whatever.cognitiveservices.azure.com/text/analytics/v3.1/entities/recognition/pii?piiCategories=default,FRDriversLicenseNumber' (an example found here: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/how-to-call ) I get a 404 error.
I believe it might be the Python SDK Issue, as when I try the API console - it works just fine. https://westus2.dev.cognitive.microsoft.com/docs/services/TextAnalytics-v3-1/operations/EntitiesRecognitionPii
The code:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

key = "key"
# NOTE: this is the endpoint from the question (REST path and query string
# appended to the resource URL); it is the configuration that yields the 404.
endpoint = "https://whatever.cognitiveservices.azure.com/text/analytics/v3.1/entities/recognition/pii?piiCategories=default,FRDriversLicenseNumber/"


# Authenticate the client using your key and endpoint
def authenticate_client():
    """Return a TextAnalyticsClient built from the module-level key and endpoint."""
    return TextAnalyticsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
    )


client = authenticate_client()


# Example method for detecting sensitive information (PII) from text
def pii_recognition_example(client):
    """Run PII recognition on two sample sentences and print every entity found.

    Documents flagged with ``is_error`` are left out of the printed output.
    """
    documents = [
        "The employee's SSN is 859-98-0987.",
        "The employee's phone number is 555-555-5555."
    ]
    response = client.recognize_pii_entities(documents, language="en")
    successful = list(filter(lambda item: not item.is_error, response))
    for item in successful:
        print("Redacted Text: {}".format(item.redacted_text))
        for found in item.entities:
            print("Entity: {}".format(found.text))
            print("\tCategory: {}".format(found.category))
            print("\tConfidence Score: {}".format(found.confidence_score))
            print("\tOffset: {}".format(found.offset))
            print("\tLength: {}".format(found.length))


pii_recognition_example(client)

The MS docs do not state this yet, but the endpoint should be kept simple:
endpoint = "https://<name>.cognitiveservices.azure.com"
and the details (such as the PII categories) passed as arguments to client.recognize_pii_entities().
The below code works just fine:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

key = "key"
# Only the bare resource URL goes here; the SDK builds the REST path itself.
endpoint = "https://<name>.cognitiveservices.azure.com"


# Authenticate the client using your key and endpoint
def authenticate_client():
    """Return a TextAnalyticsClient built from the module-level key and endpoint."""
    ta_credential = AzureKeyCredential(key)
    text_analytics_client = TextAnalyticsClient(
        endpoint=endpoint,
        credential=ta_credential)
    return text_analytics_client


client = authenticate_client()


# Example method for detecting sensitive information (PII) from text
def pii_recognition_example(client):
    """Run PII recognition on two sample sentences and print the results.

    The PII categories are selected through ``categories_filter`` instead of
    being encoded in the endpoint URL.

    Args:
        client: An authenticated TextAnalyticsClient.
    """
    documents = [
        "The employee's SSN is 859-98-0987.",
        "The employee's phone number is 555-555-5555."
    ]
    response = client.recognize_pii_entities(
        documents,
        language="en",
        categories_filter=["default", "FRDriversLicenseNumber"],
    )
    for doc in response:
        if doc.is_error:
            # Report failed documents instead of silently dropping them.
            print("Document {} failed: {} - {}".format(
                doc.id, doc.error.code, doc.error.message))
            continue
        print("Redacted Text: {}".format(doc.redacted_text))
        for entity in doc.entities:
            print("Entity: {}".format(entity.text))
            print("\tCategory: {}".format(entity.category))
            print("\tConfidence Score: {}".format(entity.confidence_score))
            print("\tOffset: {}".format(entity.offset))
            print("\tLength: {}".format(entity.length))


pii_recognition_example(client)

Related

Google API - Extracted comments from a google spreadsheet using Python number only 20?

I have a google spreadsheet with around 3000 rows and I am trying to extract comments from this spreadsheet using the following code:
import requests
from apiclient import errors
from apiclient import discovery
from apiclient.discovery import build
from oauth2client.client import OAuth2WebServerFlow
import httplib2
CLIENT_ID = "xxxxxyyyy"
CLIENT_SECRET = "xxxxxxx"
OAUTH_SCOPE = "https://www.googleapis.com/auth/drive"
REDIRECT_URI = 'urn:ietf:wg:oauth:2.0:oob'
# A hyphen is not allowed in a Python identifier, so the name is `file_id`.
file_id = "zzzzzz"


def retrieve_comments(service, file_id):
    """Retrieve a list of comments.

    Only a single ``comments().list`` request is issued, so at most one page
    of results is returned.

    Args:
        service: Drive API service instance.
        file_id: ID of the file to retrieve comments for.
    Returns:
        List of comments, or None if the request raised an HttpError.
    """
    try:
        comments = service.comments().list(fileId=file_id).execute()
        return comments.get('items', [])
    except errors.HttpError as error:
        print(f'An error occurred: {error}')
    return None


# ...
flow = OAuth2WebServerFlow(CLIENT_ID, CLIENT_SECRET, OAUTH_SCOPE)
flow.redirect_uri = REDIRECT_URI
authorize_url = flow.step1_get_authorize_url()
print("Go to the following link in your web browser " + authorize_url)
code = input("Enter verfication code : ").strip()
credentials = flow.step2_exchange(code)
http = httplib2.Http()
http = credentials.authorize(http)
service = build('drive', 'v2', http=http)
comments = retrieve_comments(service, file_id)
However, the length of the list comments is only 20 whereas the spreadsheet surely contains more comments. Could someone explain which parameter I would need to tweak to retrieve all the comments in the spreadsheet? Thanks!
At the current stage, the default value of maxResults (Drive API v2) or pageSize (Drive API v3) for the "Comments: list" method of the Drive API is 20. I think this is the reason for your issue that "the length of the list comments is only 20 whereas the spreadsheet surely contains more comments". In this case, how about the following modification?
From:
comments = service.comments().list(fileId=file_id).execute()
return comments.get('items', [])
To:
From your script, when you want to use Drive API v2, please modify it as follows.
file_id = "###" # Please set your file ID.
res = []
page_token = None
while True:
obj = service.comments().list(fileId=file_id, pageToken=page_token, maxResults=100, fields="*").execute()
if len(obj.get("items", [])) > 0:
res = [*res, *obj.get("items", [])]
page_token = obj.get("nextPageToken")
if not page_token:
break
return res
When you want to use Drive API v3, please modify it as follows.
file_id = "###" # Please set your file ID.
res = []
page_token = None
while True:
obj = service.comments().list(fileId=file_id, pageToken=page_token, pageSize=100, fields="*").execute()
if len(obj.get("comments", [])) > 0:
res = [*res, *obj.get("comments", [])]
page_token = obj.get("nextPageToken")
if not page_token:
break
return res
With this modification, all comments in the Spreadsheet are returned as an array.
References:
Comments: list of Drive API v2
Comments: list of Drive API v3

Using pagination to return all Azure AD User info in Python

I'm fetching Azure AD data in a Python script. What I'm interested in is specifically name, email and jobtitle from the Users site in Azure.
However, the get request is limited to 100 rows, which I assume has something to do with pagination. Additionally, the jobtitle is empty, which has something to do with the app registration.
QUESTION
How do I return more than 100 rows? I'm struggling with the documentation, and I can't find any Python examples on this?
My code is:
import logging
import json
import msal
import requests
import pandas
# Globals
token = None
graphApiVersion = "v1.0"
uri = "https://graph.microsoft.com/{v}/{r}"
headers = None


# Functions
def authenticate():
    """Obtain a Graph access token and store it, plus the matching
    Authorization header, in the module-level ``token`` and ``headers``."""
    global token, headers
    tenant_authority = "https://login.microsoftonline.com/<tenant id>"
    client_id = "<app id>"
    client_secret = "<app secret>"
    scopes = ["https://graph.microsoft.com/.default"]
    confidential_app = msal.ConfidentialClientApplication(
        client_id,
        authority=tenant_authority,
        client_credential=client_secret,
    )
    # Try the token cache first; request a fresh token only when it is empty.
    token = confidential_app.acquire_token_silent(scopes, account=None)
    if not token:
        token = confidential_app.acquire_token_for_client(scopes=scopes)
    headers = {'Authorization': 'Bearer ' + token['access_token']}


def users(Format=True):
    """Query the /users collection with a fixed $select list."""
    resource = "/users?$select=displayName,givenName,jobTitle,email,department"
    return query(graphApiVersion, resource, Format)


def query(v, r, Format=True):
    """GET resource ``r`` from API version ``v``.

    Prints the normalized "value" array when ``Format`` is true (and returns
    None); otherwise returns the raw "value" array.
    """
    payload = requests.get(uri.format(v=v, r=r), headers=headers).json()
    rows = payload["value"]
    if not Format:
        return rows
    print(pandas.json_normalize(rows))
You can just use the $top query parameter to page Microsoft Graph data. When the result spans more than one page, the response contains an additional link named @odata.nextLink that you use to fetch the next page of data. For details, see the official doc.
I also did a quick test on my side with your code as below:
import logging
import json
import msal
import requests
import pandas
# Globals
token = '<access token>'
graphApiVersion = "v1.0"
uri = "https://graph.microsoft.com/{v}/{r}"
headers = None


# Functions
def users(Format=True):
    """Query the /users collection, asking for up to 200 rows per page."""
    return query(graphApiVersion, "/users?$select=displayName,givenName,jobTitle,email,department&$top=200", Format)


def query(v, r, Format=True):
    """GET resource ``r`` from API version ``v``.

    When ``Format`` is true, prints the normalized "value" array and, if the
    response has one, the URL of the next page; returns None. Otherwise
    returns the raw "value" array.
    """
    dest = uri.format(v=v, r=r)
    result = requests.get(dest, headers={"Authorization": "Bearer " + token}).json()
    if Format:
        print(pandas.json_normalize(result["value"]))
        # The paging link is keyed "@odata.nextLink" and is absent on the
        # last page, so look it up without assuming it exists.
        next_link = result.get("@odata.nextLink")
        if next_link:
            print("next page URL:" + next_link)
    else:
        return result["value"]


users()
It works for me:

im trying to make an api in flask that takes in 2 inputs

ive got an api that takes in an id
http://127.0.0.1:5000/api/v1/resources/books?id=u3qR4Ps4TbATrg97
looks like that
what im trying to do after that is add something to the end of the url, for example
http://127.0.0.1:5000/api/v1/resources/books?id=u3qR4Ps4TbATrg97uid=something
im not 100% sure if this is possible
# Create some test data for our catalog in the form of a list of dictionaries.
books = [
    {'id': 'u3qR4Ps4TbATrg97',
     'uid': 'what',
     'title': 'A Fire Upon the Deep',
     'author': 'Vernor Vinge',
     'first_sentence': 'The coldsleep itself was dreamless.',
     'year_published': '1992'}
]


@app.route('/api/v1/resources/books', methods=['GET'])
def api_id():
    # Check if both an id and a uid were provided as part of the URL.
    # If they are, assign them to variables.
    # If either is missing, display an error in the browser.
    # (`'id' and 'uid' in request.args` would only test for 'uid'.)
    if 'id' in request.args and 'uid' in request.args:
        id = str(request.args['id'])
        uid = str(request.args['uid'])
    else:
        return "Error: No id field provided. Please specify an id."
    results = []
    for book in books:
        if book['id'] == id:
            results.append(book)
        if book['uid'] == uid:
            results.append(book)
    # (excerpt ends here; the post states this is not the whole file)
this is what i have so far, mostly copy pasted from here
thats no the whole file just the important bits i can think of
You can add two inputs inside the GET query like this
http://127.0.0.1:5000/api/v1/resources/books?id=u3qR4Ps4TbATrg97&uid=something
Just put an & in between!
Use request.args.get method to get parameters from your url. Also add & to your URL as a parameter separator.
from flask import Flask, request

app = Flask(__name__)


# The decorator is what registers the view; as a comment the URL would 404.
@app.route('/api/v1/resources/books')
def books():
    """Echo the ``id`` and ``uid`` query parameters (None when absent)."""
    id_ = request.args.get('id')
    uid = request.args.get('uid')
    return f'id: {id_}, uid: {uid}'


app.run()
Open http://127.0.0.1:5000/api/v1/resources/books?id=u3qR4Ps4TbATrg97&uid=something
in browser and you'll get:
id: u3qR4Ps4TbATrg97, uid: something
Multiple parameters|arguments are passed with & character. ?params1=5&params2=3. For your example: http://127.0.0.1:5000/api/v1/resources/books?id=u3qR4Ps4TbATrg97&uid=what. For the code, I would do:
from flask import Flask, request, jsonify, make_response
app = Flask(__name__)

# Create some test data for our catalog in the form of a list of dictionaries.
books = [
    {
        "id": "u3qR4Ps4TbATrg97",
        "uid": "what",
        "title": "A Fire Upon the Deep",
        "author": "Vernor Vinge",
        "first_sentence": "The coldsleep itself was dreamless.",
        "year_published": "1992",
    }
]


@app.route("/api/v1/resources/books", methods=["GET"])
def api_id():
    """Return the books whose ``id`` or ``uid`` matches the query parameters.

    Responds with a JSON error message and status 400 unless both ``id`` and
    ``uid`` are present in the query string; otherwise responds with status
    200 and the matching books under the "message" key.
    """
    # Require BOTH parameters: a set intersection is truthy when only one of
    # them is present, which then fails on the missing key below.
    if "id" in request.args and "uid" in request.args:
        id_ = str(request.args["id"])
        uid = str(request.args["uid"])
    else:
        return make_response(
            jsonify({"message": "Error: No id field provided. Please specify an id."}),
            400,
        )
    # A single combined test keeps a book that matches on both fields from
    # being listed twice.
    results = [
        book for book in books
        if book["id"] == id_ or book["uid"] == uid
    ]
    response = make_response(
        jsonify({"message": results}),
        200,
    )
    response.headers["Content-Type"] = "application/json"
    return response
This returns status code 400 when a required query parameter is missing, and 200 otherwise (with the matching books, possibly an empty list).

How to list Google Merchant Center Feeds

I am trying to read customers' feeds through Google API. I got an access token.
This is the documentation I followed: https://developers.google.com/shopping-content/developers-guide-python#authentication
Example here shows an ACCOUNT_ID to use, but I don't understand where to receive this info.
import gdata.contentforshopping.client
# Account ID handed to the client constructor below (example value from the docs).
ACCOUNT_ID = '1234567'
shopping_client = gdata.contentforshopping.client.ContentForShoppingClient(account_id=ACCOUNT_ID)
Here is what i did so far:
import gdata.contentforshopping.client
import gdata.gauth
part = 2
auth_token = gdata.gauth.OAuth2Token(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, scope=SCOPE, user_agent=USER_AGENT)
shopping_client = gdata.contentforshopping.client.ContentForShoppingClient()
authorize_url = auth_token.generate_authorize_url(redirect_uri=APPLICATION_REDIRECT_URI)
if part == 1:
print 'Please visit: %s' % authorize_url
elif part == 2:
query = {'code': 'xxxx'} # received from result of part == 1
auth_token.get_access_token(query)
auth_token.authorize(shopping_client)
accounts = shopping_client.GetClientAccounts()
print(accounts)
It turned out there is no proper way to do this. You have to get an error from old API and parse it to find out your merchant id.
Source: https://groups.google.com/forum/#!topic/google-content-api-for-shopping/3iLEm9puJis

Is There Any Way To Check if a Twitch Stream Is Live Using Python?

I'm just wondering if there is any way to write a python script to check to see if a twitch.tv stream is live?
I'm not sure why my app engine tag was removed, but this would be using app engine.
Since all answers are actually outdated as of 2020-05-02, i'll give it a shot. You now are required to register a developer application (I believe), and now you must use an endpoint that requires a user-id instead of a username (as they can change).
See https://dev.twitch.tv/docs/v5/reference/users
and https://dev.twitch.tv/docs/v5/reference/streams
First you'll need to Register an application
From that you'll need to get your Client-ID.
The one in this example is not a real one.
TWITCH_STREAM_API_ENDPOINT_V5 = "https://api.twitch.tv/kraken/streams/{}"
API_HEADERS = {
    'Client-ID' : 'tqanfnani3tygk9a9esl8conhnaz6wj',
    'Accept' : 'application/vnd.twitchtv.v5+json',
}
reqSession = requests.Session()


def checkUser(userID): #returns true if online, false if not
    """Return True when the streams endpoint reports a live stream for userID.

    Always returns a bool: False is returned when the request fails, when the
    body is not JSON, or when the response has no non-null "stream" entry.
    """
    url = TWITCH_STREAM_API_ENDPOINT_V5.format(userID)
    try:
        req = reqSession.get(url, headers=API_HEADERS)
        jsondata = req.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print("Error checking user: ", e)
        return False
    # A missing "stream" key and a null "stream" both mean "not live".
    return isinstance(jsondata, dict) and jsondata.get('stream') is not None
I hated having to go through the process of making an api key and all those things just to check if a channel was live, so i tried to find a workaround:
As of june 2021 if you send a http get request to a url like https://www.twitch.tv/CHANNEL_NAME, in the response there will be a "isLiveBroadcast": true if the stream is live, and if the stream is not live, there will be nothing like that.
So i wrote this code as an example in nodejs:
const fetch = require('node-fetch');
const channelName = '39daph';

// Fetch the channel page and report whether it contains the live marker.
async function main() {
  const response = await fetch(`https://www.twitch.tv/${channelName}`);
  const html = await response.text();
  if (html.includes('isLiveBroadcast'))
    console.log(`${channelName} is live`);
  else
    console.log(`${channelName} is not live`);
}

// Handle network failures instead of leaving the promise rejection unhandled.
main().catch((err) => {
  console.error(`Failed to check ${channelName}:`, err);
  process.exitCode = 1;
});
here is also an example in python:
import requests
channelName = '39daph'
# Download the channel page and look for the marker text in its HTML.
page = requests.get('https://www.twitch.tv/' + channelName)
contents = page.content.decode('utf-8')
status = ' is live' if 'isLiveBroadcast' in contents else ' is not live'
print(channelName + status)
It looks like Twitch provides an API (documentation here) that provides a way to get that info. A very simple example of getting the feed would be:
import urllib2
# Python 2 snippet: `urllib2` and the `print` statement below do not exist in Python 3.
url = 'http://api.justin.tv/api/stream/list.json?channel=FollowGrubby'
contents = urllib2.urlopen(url)
# Dump the raw response body; nothing is parsed here.
print contents.read()
This will dump all of the info, which you can then parse with a JSON library (XML looks to be available too). Looks like the value returns empty if the stream isn't live (haven't tested this much at all, nor have I read anything :) ). Hope this helps!
RocketDonkey's fine answer seems to be outdated by now, so I'm posting an updated answer for people like me who stumble across this SO-question with google.
You can check the status of the user EXAMPLEUSER by parsing
https://api.twitch.tv/kraken/streams/EXAMPLEUSER
The entry "stream":null will tell you that the user is offline, if that user exists.
Here is a small Python script which you can use on the commandline that will print 0 for user online, 1 for user offline and 2 for user not found.
#!/usr/bin/env python3
# checks whether a twitch.tv userstream is live
import argparse
from urllib.request import urlopen
from urllib.error import URLError
import json
def parse_args():
    """Parse the command line and return the argparse namespace.

    The namespace has one attribute, ``USER``: a one-element list holding the
    twitch.tv username (``nargs=1`` always yields a list).
    """
    cli = argparse.ArgumentParser(
        description=('Check online status of twitch.tv user.\n'
                     'Exit prints are 0: online, 1: offline, 2: not found, 3: error.'),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    cli.add_argument('USER', nargs=1, help='twitch.tv username')
    return cli.parse_args()
def check_user(user):
    """Return the status code for a twitch.tv user.

    Returns:
        0: online, 1: offline, 2: not found, 3: error (network failure,
        timeout, or a response that is not the expected JSON).
    """
    url = 'https://api.twitch.tv/kraken/streams/' + user
    try:
        # The context manager closes the HTTP response once it is read.
        with urlopen(url, timeout=15) as response:
            info = json.loads(response.read().decode('utf-8'))
        return 1 if info['stream'] is None else 0
    except URLError as e:
        # HTTPError (a URLError subclass) carries the numeric status in
        # `code`; plain URLErrors have no such attribute. 404 and 422 are the
        # statuses behind 'Not Found' and 'Unprocessable Entity'.
        if getattr(e, 'code', None) in (404, 422):
            return 2
        return 3
    except (OSError, ValueError, KeyError, TypeError):
        # Socket timeouts, undecodable or invalid JSON, or an unexpected shape.
        return 3
# main: print the status code for the username given on the command line;
# Ctrl-C exits quietly.
try:
    print(check_user(parse_args().USER[0]))
except KeyboardInterrupt:
    pass
Here is a more up to date answer using the latest version of the Twitch API (helix). (kraken is deprecated and you shouldn't use GQL since it's not documented for third party use).
It works but you should store the token and reuse the token rather than generate a new token every time you run the script.
import requests
client_id = ''
client_secret = ''
streamer_name = ''

# Exchange the app credentials for an app access token.
body = {
    'client_id': client_id,
    'client_secret': client_secret,
    "grant_type": 'client_credentials'
}
r = requests.post('https://id.twitch.tv/oauth2/token', body)
keys = r.json()

# The token response and the headers hold the bearer token, so they are
# deliberately not printed.
headers = {
    'Client-ID': client_id,
    'Authorization': 'Bearer ' + keys['access_token']
}

# Passing the login through `params` lets requests URL-encode it.
stream = requests.get(
    'https://api.twitch.tv/helix/streams',
    params={'user_login': streamer_name},
    headers=headers,
)
stream_data = stream.json()
print(stream_data)

if len(stream_data['data']) == 1:
    live = stream_data['data'][0]
    print(streamer_name + ' is live: ' + live['title'] + ' playing ' + live['game_name'])
else:
    print(streamer_name + ' is not live')
📚 Explanation
Now, the Twitch API v5 is deprecated. The helix API is in place, where an OAuth Authorization Bearer AND client-id is needed. This is pretty annoying, so I went on a search for a viable workaround, and found one.
🌎 GraphQL
When inspecting Twitch's network requests, while not being logged in, I found out the anonymous API relies on GraphQL. GraphQL is a query language for APIs.
# Look up a user by login name and request the id of that user's stream.
query {
user(login: "USERNAME") {
stream {
id
}
}
}
In the graphql query above, we are querying a user by their login name. If they are streaming, the stream's id will be given. If not, None will be returned.
🐍 The Final Code
The finished python code, in a function, is below. The client-id is taken from Twitch's website. Twitch uses the client-id to fetch information for anonymous users. It will always work, without the need of getting your own client-id.
import requests
# ...
def checkIfUserIsStreaming(username):
    """Return True when the GraphQL endpoint reports a stream for ``username``.

    Returns False when the user has no stream or when the response contains
    no user object.
    """
    url = "https://gql.twitch.tv/gql"
    # The login is sent as a GraphQL variable rather than spliced into the
    # query text, so quotes in `username` cannot alter the query.
    query = "query($login: String) {\n user(login: $login) {\n stream {\n id\n }\n }\n}"
    payload = {"query": query, "variables": {"login": username}}
    response = requests.request("POST", url, json=payload, headers={"client-id": "kimne78kx3ncx6brgo4mv6wki5h1ko"}).json()
    # NOTE(review): "user" is presumably null for an unknown login; guard it
    # so the lookup below cannot raise TypeError.
    user = (response.get("data") or {}).get("user")
    return bool(user and user.get("stream"))
I've created a website where you can play with Twitch's GraphQL API. Refer to the GraphQL Docs for help on GraphQL syntax! There's also Twitch GraphQL API documentation on my playground.
Use the twitch api with your client_id as a parameter, then parse the json:
https://api.twitch.tv/kraken/streams/massansc?client_id=XXXXXXX
Twitch Client Id is explained here: https://dev.twitch.tv/docs#client-id,
you need to register a developer application: https://www.twitch.tv/kraken/oauth2/clients/new
Example:
import requests
import json
def is_live_stream(streamer_name, client_id):
    """Return True when the streams endpoint reports a non-null "stream"."""
    endpoint = "".join([
        "https://api.twitch.tv/kraken/streams/",
        streamer_name,
        "?client_id=",
        client_id,
    ])
    payload = json.loads(requests.get(endpoint).content)
    if payload["stream"] is None:
        return False
    return True
I'll try to shoot my shot, just in case someone still needs an answer to this, so here it goes
import requests
import time
from twitchAPI.twitch import Twitch
client_id = ""
client_secret = ""
twitch = Twitch(client_id, client_secret)
twitch.authenticate_app([])
TWITCH_STREAM_API_ENDPOINT_V5 = "https://api.twitch.tv/kraken/streams/{}"
API_HEADERS = {
    'Client-ID' : client_id,
    'Accept' : 'application/vnd.twitchtv.v5+json',
}


def checkUser(user): #returns true if online, false if not
    """Return True when ``user`` (a login name) currently has a live stream.

    Always returns a bool: False is returned for an unknown login, for a
    failed lookup or request, and for a response without a non-null "stream".
    """
    try:
        matches = twitch.get_users(logins=[user])['data']
        if not matches:
            # Unknown login: nothing to index into.
            return False
        url = TWITCH_STREAM_API_ENDPOINT_V5.format(matches[0]['id'])
        # The context manager closes the session's connections after use.
        with requests.Session() as session:
            jsondata = session.get(url, headers=API_HEADERS).json()
    except Exception as e:
        print("Error checking user: ", e)
        return False
    return isinstance(jsondata, dict) and jsondata.get('stream') is not None


print(checkUser('michaelreeves'))
https://dev.twitch.tv/docs/api/reference#get-streams
import requests
# ================================================================
# your twitch client id
client_id = ''
# your twitch secret
client_secret = ''
# twitch username you want to check if it is streaming online
twitch_user = ''
# ================================================================
#getting auth token
url = 'https://id.twitch.tv/oauth2/token'
params = {
    'client_id':client_id,
    'client_secret':client_secret,
    'grant_type':'client_credentials'}
# Send the credentials in the form body, not the query string, so the client
# secret does not end up in the request URL.
req = requests.post(url=url,data=params)
token = req.json()['access_token']
# The access token is deliberately not printed.
# ================================================================
#getting user data (user id for example)
url = 'https://api.twitch.tv/helix/users'
headers = {
    'Authorization':f'Bearer {token}',
    'Client-Id':f'{client_id}'}
req = requests.get(url=url,params={'login':twitch_user},headers=headers)
userdata = req.json()
if not userdata['data']:
    # An empty list would otherwise fail with an IndexError below.
    raise SystemExit(f'twitch user not found: {twitch_user!r}')
userid = userdata['data'][0]['id']
print(f'{userid=}')
# ================================================================
#getting stream info (by user id for example)
url = 'https://api.twitch.tv/helix/streams'
headers = {
    'Authorization':f'Bearer {token}',
    'Client-Id':f'{client_id}'}
req = requests.get(url=url,params={'user_id':userid},headers=headers)
streaminfo = req.json()
print(f'{streaminfo=}')
# ================================================================
# ================================================================
This solution doesn't require registering an application
import requests
HEADERS = { 'client-id' : 'kimne78kx3ncx6brgo4mv6wki5h1ko' }
GQL_QUERY = """
query($login: String) {
user(login: $login) {
stream {
id
}
}
}
"""


def isLive(username):
    """Return True when the GraphQL endpoint reports a stream for ``username``.

    Returns False when the user has no stream or when the response contains
    no user object.
    """
    QUERY = {
        'query': GQL_QUERY,
        'variables': {
            'login': username
        }
    }
    response = requests.post('https://gql.twitch.tv/gql',
                             json=QUERY, headers=HEADERS)
    dict_response = response.json()
    # NOTE(review): "user" is presumably null for an unknown login; guard it
    # so the subscript cannot raise TypeError and abort the loop below.
    user = (dict_response.get('data') or {}).get('user')
    return user is not None and user.get('stream') is not None


if __name__ == '__main__':
    USERS = ['forsen', 'offineandy', 'dyrus']
    for user in USERS:
        IS_LIVE = isLive(user)
        print(f'User {user} live: {IS_LIVE}')
Yes.
You can use Twitch API call https://api.twitch.tv/kraken/streams/YOUR_CHANNEL_NAME and parse result to check if it's live.
The below function returns a streamID if the channel is live, else returns -1.
import urllib2, json, sys
TwitchChannel = 'A_Channel_Name'


def IsTwitchLive(): # return the stream Id if streaming else returns -1
    """Return the stream id of ``TwitchChannel`` as an int, or -1 if not live.

    Python 2 code (``urllib2``). Errors from opening the URL or from parsing
    the body are not caught here.
    """
    url = 'https://api.twitch.tv/kraken/streams/' + TwitchChannel
    response = urllib2.urlopen(url)
    try:
        data = json.loads(response.read())
    finally:
        # Explicit close: no `with` is used on the urllib2 response here.
        response.close()
    try:
        return int(data['stream']['_id'])
    except (KeyError, TypeError, ValueError):
        # Missing key, null "stream", or a non-numeric id all mean "no stream id".
        return -1

Categories

Resources