Skip to content

Commit b474c29

Browse files
author
Steve Cook
committed
Added uClassify scripts, example config and readme notes
1 parent 821b229 commit b474c29

File tree

5 files changed

+157
-0
lines changed

5 files changed

+157
-0
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,10 @@ Resources for analysis of our data e.g. from Fixometer, site stats.
33

44
See https://github.com/TheRestartProject/DataAnalytics/wiki for details on how to install, administer and use the Metabase instance.
55

6+
# Classifier
7+
These scripts are used for creating, training and querying the classifier:
8+
* createClassifier <trainingdata.xml> - set up a new classifier for DeviceCategory and train it with the data provided. The training data should be in the XML format produced by getClassifierTrainingData (`mysql --xml` output): each `row` has a `device.text` field containing all the training text and a `category.name` field containing the category name. Note that this will destroy and recreate any existing classifier.
9+
* testClassifier <text> - input some text and the classifier will return the most likely class (category); add -v to also show its probability, or -vv to show the probabilities of every class
10+
* getClassifierTrainingData - extract the training data from the database in the required XML format (written to /tmp/query_result.xml)
11+
12+
See document on Google Docs for more details of this and the required credentials.

config/uclassify.ini.example

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[uclassify.com]
2+
User = restartproject
3+
ClassifierName = Device Category
4+
ReadToken = ************
5+
WriteToken = ************
6+
ServiceURL = https://api.uclassify.com/v1/
7+
8+
9+

scripts/createClassifier

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python3
2+
3+
import requests
4+
import xml.etree.ElementTree as ET
5+
import sys, os
6+
import re, string
7+
import configparser
8+
9+
## createClassifier <trainingdata.xml>
##
## Destroys and recreates the uClassify classifier named in
## config/uclassify.ini, adds one class per device category found in the XML
## training file, then trains each class with that category's texts.

## Set to False to parse the training data without calling the uClassify API
USE_API = True

## Runs of characters that are neither whitespace nor alphanumeric
## (underscores are removed as well); compiled once rather than per row
NON_WORD_RUN = re.compile(r'([^\s\w]|_)+')


def load_settings():
    """Read the uClassify settings from config/uclassify.ini.

    Returns a (classifier name, write token, base URL) tuple.
    Exits with an error message if the config file cannot be read.
    """
    ## Resolve against the script's absolute location: when the script is run
    ## by bare name (e.g. "python3 createClassifier") dirname(sys.argv[0]) is
    ## empty and a plain string concatenation would point at "/../config".
    scriptDir = os.path.dirname(os.path.abspath(sys.argv[0]))
    configPath = os.path.join(scriptDir, '..', 'config', 'uclassify.ini')

    config = configparser.ConfigParser()
    ## read() silently skips missing files and returns the list of files it
    ## actually parsed, so an empty result means the config was not found.
    if not config.read(configPath):
        sys.exit('Could not read config file: ' + configPath)

    configBase = config['uclassify.com']
    return (configBase['ClassifierName'],
            configBase['WriteToken'],
            configBase['serviceURL'] + configBase['User'])


def clean_text(text):
    """Return text reduced to alphanumeric words separated by single spaces.

    None (what ElementTree reports for an empty element) is treated as ''.
    """
    return " ".join(NON_WORD_RUN.sub('', text or '').split())


def build_training_data(root):
    """Build a dictionary of training phrases keyed by category name.

    root is the parsed XML root; every <row> below it is expected to hold
    <field name="device.text"> and <field name="category.name"> children.
    Rows missing either field, or whose text or category is empty after
    cleaning, are skipped because they carry nothing to train on.
    """
    trainingData = {}
    for rowElem in root.iter('row'):
        textElem = rowElem.find('field[@name="device.text"]')
        categoryElem = rowElem.find('field[@name="category.name"]')
        if textElem is None or categoryElem is None:
            continue
        cleanText = clean_text(textElem.text)
        categoryName = clean_text(categoryElem.text)
        if not cleanText or not categoryName:
            continue
        trainingData.setdefault(categoryName, []).append(cleanText)
    return trainingData


def main(argv):
    """Recreate and train the classifier from the XML file named in argv[1]."""
    if len(argv) < 2:
        ## File not supplied - fatal
        sys.exit('Usage: createClassifier <xml training data>')
    datasource = argv[1]

    classifier, writeToken, baseUrl = load_settings()
    classifierUrl = baseUrl + '/' + classifier
    headers = {'Authorization': 'Token ' + writeToken}

    trainingData = build_training_data(ET.parse(datasource).getroot())
    print('Categories: ' + str(trainingData.keys()))

    if not USE_API:
        return

    ## Remove classifier if it already exists; the status is only reported
    ## because a failure here is expected when there is nothing to delete
    response = requests.delete(classifierUrl, headers=headers, json={})
    print(response.status_code)

    ## Create the classifier
    response = requests.post(baseUrl, headers=headers,
                             json={'classifierName': classifier})
    print('Create classifier "' + classifier + '" : response=' + str(response.status_code))
    response.raise_for_status()  ## The create action should always succeed, so raise fatal if it fails
    print(response.text)

    ## Create a new uClassify class for each device category
    for categoryName in trainingData:
        response = requests.post(classifierUrl + '/addClass', headers=headers,
                                 json={'className': categoryName})
        print('Create class "' + categoryName + '" : response=' + str(response.status_code))
        response.raise_for_status()  ## Raise fatal if we could not create the class

    ## Train every class, carrying on after a failure so that one bad class
    ## does not prevent the others from being trained
    failedClasses = []
    for className, texts in trainingData.items():
        print('Training class: "' + className + '"')
        print('Training data : "' + str(texts) + '"')
        response = requests.post(classifierUrl + '/' + className + '/train',
                                 headers=headers, json={'texts': texts})
        print(response.status_code)
        print(response.text)
        if not response.ok:
            failedClasses.append(className)

    if failedClasses:
        ## Report through the exit status so callers can detect partial training
        sys.exit('Training failed for: ' + ', '.join(failedClasses))


if __name__ == '__main__':
    main(sys.argv)

scripts/getClassifierTrainingData

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/sh -

## Extract classifier training data from the Fixometer database as XML.
##
## Credentials: the password is deliberately NOT passed on the command line
## (where it would be visible in the process list and in version control).
## Put it in the [client] section of ~/.my.cnf, or export MYSQL_PWD, both of
## which the mysql client reads by itself.
##
## Optional environment overrides:
##   REPORTFILE         - output file   (default /tmp/query_result.xml)
##   FIXOMETER_DB_USER  - database user (default fixometer_root)
##   FIXOMETER_DB_NAME  - database name (default fixometer_dev)

REPORTFILE=${REPORTFILE:-/tmp/query_result.xml}
DB_USER=${FIXOMETER_DB_USER:-fixometer_root}
DB_NAME=${FIXOMETER_DB_NAME:-fixometer_dev}

## Get device brand/model/comment and the category
## Exclude any where category is 'Misc' (idcategories=46)
## The here-document delimiter is quoted so the shell never expands the SQL.
if ! mysql --xml -u "$DB_USER" "$DB_NAME" > "$REPORTFILE" << 'ENDSQL'
select CONCAT_WS(' ', d.brand, d.model, d.problem) as 'device.text', d.iddevices as 'device.id', c.idcategories as 'category.id', c.name as 'category.name' from devices d, categories c where d.category=c.idcategories and c.idcategories!=46;
ENDSQL
then
    ## Do not claim success when the query failed
    echo "Failed to extract training data from $DB_NAME" >&2
    exit 1
fi

echo "Done; training data is in $REPORTFILE"
exit 0

scripts/testClassifier

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/usr/bin/env python3
2+
3+
import configparser
4+
import sys, os
5+
import requests
6+
import argparse
7+
8+
## testClassifier [-v|-vv] <text>
##
## Sends the text to the uClassify classifier named in config/uclassify.ini
## and prints the most likely device category.

## Printed when the classifier cannot prefer any one class over the others
NO_MATCH_MESSAGE = "Classifier was not able to match to a category"


def load_settings():
    """Read the uClassify settings from config/uclassify.ini.

    Returns a (classifier name, read token, base URL) tuple.
    Exits with an error message if the config file cannot be read.
    """
    ## Resolve against the script's absolute location: when the script is run
    ## by bare name (e.g. "python3 testClassifier") dirname(sys.argv[0]) is
    ## empty and a plain string concatenation would point at "/../config".
    scriptDir = os.path.dirname(os.path.abspath(sys.argv[0]))
    configPath = os.path.join(scriptDir, '..', 'config', 'uclassify.ini')

    config = configparser.ConfigParser()
    ## read() silently skips missing files and returns the list of files it
    ## actually parsed, so an empty result means the config was not found.
    if not config.read(configPath):
        sys.exit('Could not read config file: ' + configPath)

    configBase = config['uclassify.com']
    return (configBase['ClassifierName'],
            configBase['ReadToken'],
            configBase['serviceURL'] + configBase['User'])


def format_percentage(className, probability):
    """Return one 'NN.NN% : class' output line, percentage right-aligned."""
    return '{0: >6}%'.format(str(round(probability * 100, 2))) + ' : ' + className


def format_result(classification, verbosity):
    """Return the lines to print for a classification result.

    classification maps class name to probability; verbosity is the number
    of -v flags given. When no class is strictly more probable than the
    least likely one (including an empty result), the no-match message is
    returned instead.
    """
    if not classification:
        return [NO_MATCH_MESSAGE]

    # Find the category with the highest probability
    mostLikelyClass = max(classification, key=classification.__getitem__)
    if not classification[mostLikelyClass] > min(classification.values()):
        return [NO_MATCH_MESSAGE]

    if verbosity >= 2:
        # High verbosity - Output all the classes and probabilities
        return [format_percentage(name, p) for name, p in classification.items()]
    if verbosity == 1:
        # Medium verbosity - output the most likely class and its probability
        return [format_percentage(mostLikelyClass, classification[mostLikelyClass])]
    # Low verbosity - just output the most likely class
    return [mostLikelyClass]


def main():
    """Parse the command line, query uClassify and print the result."""
    ## Parse arguments before touching the config so that --help always works
    parser = argparse.ArgumentParser(description='Classify some text by device category using uClassify')
    parser.add_argument('text', type=str, help='text to classify')
    parser.add_argument('-v', '--verbosity', default=0, action='count', help='increase output verbosity')
    args = parser.parse_args()

    classifier, readToken, baseUrl = load_settings()

    response = requests.post(baseUrl + '/' + classifier + '/classify',
                             headers={'Authorization': 'Token ' + readToken},
                             json={'texts': [args.text]})
    response.raise_for_status()
    data = response.json()

    ## One text was sent, so the first result is the one of interest; an
    ## empty reply is treated as "no match" instead of raising IndexError
    items = data[0]['classification'] if data else []
    ## Convert to dictionary
    classification = {item['className']: item['p'] for item in items}

    for line in format_result(classification, args.verbosity):
        print(line)


if __name__ == '__main__':
    main()
53+
54+

0 commit comments

Comments
 (0)