Added uClassify scripts, example config and readme notes

Steve Cook · Steve Cook · commit b474c2960a97 · 2018-05-03T08:29:23.000+01:00
diff --git a/README.md b/README.md
@@ -3,3 +3,10 @@ Resources for analysis of our data e.g. from Fixometer, site stats.
 
 See https://github.com/TheRestartProject/DataAnalytics/wiki for details on how to install, administer and use the Metabase instance.
 
+# Classifier
+These scripts are used for creating, training and querying the classifier:
+* createClassifier <trainingdata.xml> - set up a new classifier for DeviceCategory and train it with the data provided.  The training data should be in XML format (as produced by `mysql --xml`), where each `row` element has a `device.text` field containing all the training text and a `category.name` field containing the category name.  Note this will destroy and recreate any existing classifier.
+* testClassifier [-v] <text> - input a text and the classifier will return the most likely class (category); use -v to also show its probability, or -vv to show the probabilities of every class
+* getClassifierTrainingData - extract training data from the database in the XML format required by createClassifier (written to /tmp/query_result.xml by default)
+
+See document on Google Docs for more details of this and the required credentials.
diff --git a/config/uclassify.ini.example b/config/uclassify.ini.example
@@ -0,0 +1,9 @@
+[uclassify.com]
+User = restartproject
+ClassifierName = Device Category
+ReadToken = ************
+WriteToken = ************
+ServiceURL = https://api.uclassify.com/v1/
+
+
+
diff --git a/scripts/createClassifier b/scripts/createClassifier
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""Create and train the uClassify classifier from an XML training file.
+
+Usage: createClassifier <xml training data>
+
+The input must contain <row> elements, each holding <field name="device.text">
+and <field name="category.name"> children.  Any existing classifier with the
+configured name is deleted and recreated, one class is added per category, and
+each class is trained with the texts collected for that category.
+"""
+
+import requests
+import xml.etree.ElementTree as ET
+import sys, os
+import re, string
+import configparser
+
+## Check the command line first so a usage error is reported even without a config file
+if len(sys.argv) < 2:
+  ## File not supplied - fatal
+  sys.exit('Usage: createClassifier <xml training data>')
+datasource = sys.argv[1]
+
+## Load config.  The path is resolved from this file's absolute location because
+## dirname(sys.argv[0]) is '' when run as "python3 createClassifier", which turned
+## the old concatenated path into /../config/uclassify.ini (i.e. relative to root).
+configPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'config', 'uclassify.ini')
+config = configparser.ConfigParser()
+if not config.read(configPath):
+  ## ConfigParser.read() skips files it cannot open, so detect that explicitly
+  sys.exit('Could not read config file: ' + configPath)
+configBase = config['uclassify.com']
+UCLASSIFY_CLASSIFIER = configBase['ClassifierName']  #'Device Category'
+UCLASSIFY_WRITE_TOKEN = configBase['WriteToken']
+UCLASSIFY_BASE_URL = configBase['serviceURL'] + configBase['User']
+
+## Set to False to parse the training file without calling the uClassify API
+USE_API = True
+
+## Matches runs of anything that is not whitespace or a word character, plus
+## underscores; compiled once because it is applied to every field of every row
+NON_TEXT = re.compile(r'([^\s\w]|_)+')
+
+
+def cleanField(rowElem, fieldName):
+  """Return the named <field> of rowElem with punctuation removed and whitespace collapsed.
+
+  Returns '' when the field is missing or empty.
+  """
+  field = rowElem.find('field[@name="' + fieldName + '"]')
+  ## Element.text is None for an empty element, so fall back to ''
+  text = field.text if field is not None else None
+  return " ".join(NON_TEXT.sub('', text or '').split())
+
+
+## Construct a dictionary of training phrases keyed by category name
+trainingData = {}
+skippedRows = 0
+for rowElem in ET.parse(datasource).getroot().iter('row'):
+  cleanText = cleanField(rowElem, 'device.text')
+  categoryName = cleanField(rowElem, 'category.name')
+  if not cleanText or not categoryName:
+    ## Nothing to train with, or no class to file it under
+    skippedRows += 1
+    continue
+  trainingData.setdefault(categoryName, []).append(cleanText)
+
+print('Categories: ' + str(trainingData.keys()))
+if skippedRows:
+  print('Skipped ' + str(skippedRows) + ' row(s) with no usable text or category')
+
+if USE_API:
+  authHeaders = {'Authorization': 'Token ' + UCLASSIFY_WRITE_TOKEN}
+  classifierURL = UCLASSIFY_BASE_URL + '/' + UCLASSIFY_CLASSIFIER
+
+  ## Remove classifier if it already exists.  The status is only printed, not
+  ## checked, so a failed delete does not abort the run.
+  response = requests.delete(classifierURL, headers=authHeaders, json={})
+  print(response.status_code)
+
+  ## Create the classifier
+  response = requests.post(UCLASSIFY_BASE_URL, headers=authHeaders,
+      json={'classifierName': UCLASSIFY_CLASSIFIER})
+  print('Create classifier "' + UCLASSIFY_CLASSIFIER + '" : response=' + str(response.status_code))
+  response.raise_for_status()  ## The create action should always succeed, so raise fatal if it fails
+  print(response.text)
+
+  ## Create a new uClassify class for each device category
+  for categoryName in trainingData:
+    response = requests.post(classifierURL + '/addClass', headers=authHeaders,
+        json={'className': categoryName})
+    print('Create class "' + categoryName +'" : response=' + str(response.status_code))
+    response.raise_for_status()  ## Raise fatal if we could not create the class
+
+  ## Train every class, then fail at the end if any request was rejected so a
+  ## partially-trained classifier is not reported as success
+  failedClasses = []
+  for className, texts in trainingData.items():
+    print('Training class: "' + className + '"')
+    print('Training data : "' + str(texts) + '"')
+    response = requests.post(classifierURL + '/' + className + '/train', headers=authHeaders,
+        json={'texts': texts})
+    print(response.status_code)
+    print(response.text)
+    if not response.ok:
+      failedClasses.append(className)
+  if failedClasses:
+    sys.exit('Training failed for: ' + ', '.join(failedClasses))
diff --git a/scripts/getClassifierTrainingData b/scripts/getClassifierTrainingData
@@ -0,0 +1,25 @@
+#!/bin/sh -
+## Usage: getClassifierTrainingData [output file]
+##
+## Dump the classifier training data from the fixometer_dev database as XML.
+## The output file defaults to /tmp/query_result.xml.
+
+REPORTFILE=${1:-/tmp/query_result.xml}
+
+## NOTE(review): the database password is hard-coded here and passed on the
+## command line, where it may be visible to other local users in the process
+## list.  Consider moving the credentials to a MySQL option file (e.g. ~/.my.cnf).
+
+## Get device brand/model/problem and the category
+## Exclude any where category is 'Misc' (idcategories=46)
+## The here-document delimiter is quoted so the shell never expands the SQL.
+if ! mysql --xml -u fixometer_root --password=str0ngp4ssw0rd! -t fixometer_dev > "$REPORTFILE" << 'ENDSQL'
+select CONCAT_WS(' ', d.brand, d.model, d.problem) as 'device.text', d.iddevices as 'device.id', c.idcategories as 'category.id', c.name as 'category.name' from devices d, categories c where d.category=c.idcategories and c.idcategories!=46;
+ENDSQL
+then
+  echo "Error: mysql query failed; $REPORTFILE may be empty or incomplete" >&2
+  exit 1
+fi
+
+echo "Done; training data is in $REPORTFILE"
+exit 0
diff --git a/scripts/testClassifier b/scripts/testClassifier
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Classify a piece of text by device category using uClassify.
+
+Usage: testClassifier [-v|-vv] <text>
+
+Prints the most likely class; -v adds its probability and -vv lists every
+class with its probability.
+"""
+
+import configparser
+import sys, os
+import requests
+import argparse
+
+## Parse the command line first so -h and usage errors work without a config file
+parser = argparse.ArgumentParser(description='Classify some text by device category using uClassify')
+parser.add_argument('text', type=str, help='text to classify')
+parser.add_argument('-v', '--verbosity', default=0, action='count', help='increase output verbosity')
+args = parser.parse_args()
+
+## Load config.  The path is resolved from this file's absolute location because
+## dirname(sys.argv[0]) is '' when run as "python3 testClassifier", which turned
+## the old concatenated path into /../config/uclassify.ini (i.e. relative to root).
+configPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'config', 'uclassify.ini')
+config = configparser.ConfigParser()
+if not config.read(configPath):
+  ## ConfigParser.read() skips files it cannot open, so detect that explicitly
+  sys.exit('Could not read config file: ' + configPath)
+configBase = config['uclassify.com']
+UCLASSIFY_CLASSIFIER = configBase['ClassifierName']     #'Device Category'
+UCLASSIFY_READ_TOKEN = configBase['ReadToken']
+UCLASSIFY_BASE_URL = configBase['serviceURL'] + configBase['User']
+
+response = requests.post(UCLASSIFY_BASE_URL + '/' + UCLASSIFY_CLASSIFIER + '/classify',
+    headers={'Authorization': 'Token ' + UCLASSIFY_READ_TOKEN},
+    json={'texts': [args.text]})
+response.raise_for_status()
+data = response.json()
+
+## Convert to a dictionary of probability keyed by class name.  One text was sent,
+## so only the first result is read; an empty reply yields an empty dictionary.
+classification = {}
+if data:
+  for item in data[0]['classification']:
+    classification[item['className']] = item['p']
+
+
+def formatResult(className):
+  """Return '<percent>% : <className>', percent rounded to 2 places and right-aligned in 6 columns."""
+  return '{0: >6}%'.format(str(round(classification[className]*100,2))) + ' : ' + className
+
+
+## Find the classes with the highest and lowest probability.  If they are equal
+## (or there are no classes at all) the classifier has expressed no preference.
+mostLikelyClass = max(classification, key=classification.__getitem__, default=None)
+leastLikelyClass = min(classification, key=classification.__getitem__, default=None)
+
+if mostLikelyClass is not None and classification[mostLikelyClass] > classification[leastLikelyClass]:
+  if args.verbosity >= 2:
+    # High verbosity - Output all the classes and probabilities
+    for className in classification:
+      print(formatResult(className))
+  elif args.verbosity == 1:
+    # Medium verbosity - output the most likely class and its probability
+    print(formatResult(mostLikelyClass))
+  else:
+    # Low verbosity - just output the most likely class
+    print(mostLikelyClass)
+else:
+  print("Classifier was not able to match to a category")