Skip to content

Commit d9ec484

Browse files
Xml parser (#482)
* updated xml-parser * adding xml_test.go * Improved unit testing * Updated documentation for xml parser * Updated the changelog with recent changes * Updated unit test for multiple children nodes * Implemented PR feedback * Fixed changelog and added more test cases * Added more unit tests and improved naming of current element Co-authored-by: jmwilliams89 <josh.williams@bluemedora.com>
1 parent adad941 commit d9ec484

File tree

7 files changed

+510
-0
lines changed

7 files changed

+510
-0
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
88

99
### Added
1010

11+
- Added the `xml_parser` operator [PR482](https://github.com/observIQ/stanza/pull/482)
12+
13+
## 1.2.13 - 2021-10-29
14+
15+
### Added
16+
1117
- Added the `lazy_quotes` parameter to the csv parser [PR472](https://github.com/observIQ/stanza/pull/472)
1218

1319
### Removed

cmd/stanza/init_common.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
_ "github.com/observiq/stanza/operator/builtin/parser/syslog"
2525
_ "github.com/observiq/stanza/operator/builtin/parser/time"
2626
_ "github.com/observiq/stanza/operator/builtin/parser/uri"
27+
_ "github.com/observiq/stanza/operator/builtin/parser/xml"
2728

2829
_ "github.com/observiq/stanza/operator/builtin/transformer/add"
2930
_ "github.com/observiq/stanza/operator/builtin/transformer/copy"

docs/operators/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Parsers:
2121
- [Syslog](/docs/operators/syslog_parser.md)
2222
- [Severity](/docs/operators/severity_parser.md)
2323
- [Time](/docs/operators/time_parser.md)
24+
- [XML](/docs/operators/xml_parser.md)
2425

2526
Outputs:
2627
- [Google Cloud Logging](/docs/operators/google_cloud_output.md)

docs/operators/xml_parser.md

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
## `xml_parser` operator
2+
3+
The `xml_parser` operator parses the string-type field selected by `parse_from` as XML.
4+
5+
### Configuration Fields
6+
7+
| Field | Default | Description |
8+
| --- | --- | --- |
9+
| `id` | `xml_parser` | A unique identifier for the operator |
10+
| `output` | Next in pipeline | The connected operator(s) that will receive all outbound entries |
11+
| `parse_from` | $ | A [field](/docs/types/field.md) that indicates the field to be parsed as XML |
12+
| `parse_to` | $ | A [field](/docs/types/field.md) that indicates where to parse structured data to |
13+
| `preserve_to` | | Preserves the unparsed value at the specified [field](/docs/types/field.md) |
14+
| `on_error` | `send` | The behavior of the operator if it encounters an error. See [on_error](/docs/types/on_error.md) |
15+
| `if` | | An [expression](/docs/types/expression.md) that, when set, will be evaluated to determine whether this operator should be used for the given entry. This allows you to do easy conditional parsing without branching logic with routers. |
16+
| `timestamp` | `nil` | An optional [timestamp](/docs/types/timestamp.md) block which will parse a timestamp field before passing the entry to the output operator |
17+
| `severity` | `nil` | An optional [severity](/docs/types/severity.md) block which will parse a severity field before passing the entry to the output operator |
18+
19+
20+
### Example Configurations
21+
22+
23+
#### Parse the field `message` as XML
24+
25+
Configuration:
26+
```yaml
27+
- type: xml_parser
28+
parse_from: message
29+
```
30+
31+
<table>
32+
<tr><td> Input record </td> <td> Output record </td></tr>
33+
<tr>
34+
<td>
35+
36+
```json
37+
{
38+
"timestamp": "",
39+
"record": {
40+
"message": "<person age='30'>Jon Smith</person>"
41+
}
42+
}
43+
```
44+
45+
</td>
46+
<td>
47+
48+
```json
49+
{
50+
"timestamp": "",
51+
"record": {
52+
"tag": "person",
53+
"attributes": {
54+
"age": "30"
55+
},
56+
"content": "Jon Smith"
57+
}
58+
}
59+
```
60+
61+
</td>
62+
</tr>
63+
</table>
64+
65+
#### Parse multiple XML elements
66+
67+
Configuration:
68+
```yaml
69+
- type: xml_parser
70+
parse_from: message
71+
```
72+
73+
<table>
74+
<tr><td> Input record </td> <td> Output record </td></tr>
75+
<tr>
76+
<td>
77+
78+
```json
79+
{
80+
"timestamp": "",
81+
"record": {
82+
"message": "<person age='30'>Jon Smith</person><person age='28'>Sally Smith</person>"
83+
}
84+
}
85+
```
86+
87+
</td>
88+
<td>
89+
90+
```json
91+
{
92+
"timestamp": "",
93+
"record": [
94+
{
95+
"tag": "person",
96+
"attributes": {
97+
"age": "30"
98+
},
99+
"content": "Jon Smith"
100+
},
101+
{
102+
"tag": "person",
103+
"attributes": {
104+
"age": "28"
105+
},
106+
"content": "Sally Smith"
107+
}
108+
]
109+
}
110+
```

</td>
</tr>
</table>
111+
112+
#### Parse embedded XML elements
113+
114+
Configuration:
115+
```yaml
116+
- type: xml_parser
117+
parse_from: message
118+
```
119+
120+
<table>
121+
<tr><td> Input record </td> <td> Output record </td></tr>
122+
<tr>
123+
<td>
124+
125+
```json
126+
{
127+
"timestamp": "",
128+
"record": {
129+
"message": "<worker><person age='30'>Jon Smith</person></worker>"
130+
}
131+
}
132+
```
133+
134+
</td>
135+
<td>
136+
137+
```json
138+
{
139+
"timestamp": "",
140+
"record": {
141+
"tag": "worker",
142+
"children": [
143+
{
144+
"tag": "person",
145+
"attributes": {
146+
"age": "30"
147+
},
148+
"content": "Jon Smith"
149+
}
150+
]
151+
}
152+
}
153+
```

</td>
</tr>
</table>
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
package xml
2+
3+
import (
4+
"bytes"
5+
"encoding/xml"
6+
)
7+
8+
// Element is the in-memory form of a single XML element while a document is
// being parsed. Elements are linked in both directions: downwards through
// Children and upwards through Parent.
type Element struct {
	// Tag is the local name of the element.
	Tag string
	// Content is the character data of the element with surrounding
	// whitespace removed.
	Content string
	// Attributes maps attribute local names to their values. It is nil when
	// the element has no attributes.
	Attributes map[string]string
	// Children holds the directly nested elements in document order.
	Children []*Element
	// Parent is the enclosing element, or nil for a top-level element.
	Parent *Element
}
16+
17+
// convertToMap converts an element to a map
18+
func convertToMap(element *Element) map[string]interface{} {
19+
results := map[string]interface{}{}
20+
results["tag"] = element.Tag
21+
22+
if element.Content != "" {
23+
results["content"] = element.Content
24+
}
25+
26+
if len(element.Attributes) > 0 {
27+
results["attributes"] = element.Attributes
28+
}
29+
30+
if len(element.Children) > 0 {
31+
results["children"] = convertToMaps(element.Children)
32+
}
33+
34+
return results
35+
}
36+
37+
// convertToMaps converts a slice of elements to a slice of maps
38+
func convertToMaps(elements []*Element) []map[string]interface{} {
39+
results := []map[string]interface{}{}
40+
for _, e := range elements {
41+
results = append(results, convertToMap(e))
42+
}
43+
44+
return results
45+
}
46+
47+
// newElement creates a new element for the given xml start element
48+
func newElement(element xml.StartElement) *Element {
49+
return &Element{
50+
Tag: element.Name.Local,
51+
Attributes: getAttributes(element),
52+
}
53+
}
54+
55+
// getAttributes collects the attributes of a start element into a map keyed
// by the attribute's local name.
//
// It returns nil when the element has no attributes. Only the local part of
// each attribute name is used as the key, so if two attributes share a local
// name the one that appears later in the tag wins.
func getAttributes(element xml.StartElement) map[string]string {
	if len(element.Attr) == 0 {
		return nil
	}

	// The final size is known up front, so size the map once instead of
	// letting it grow while it is filled.
	attributes := make(map[string]string, len(element.Attr))
	for _, attr := range element.Attr {
		attributes[attr.Name.Local] = attr.Value
	}

	return attributes
}
69+
70+
// getValue converts XML character data to a string with leading and trailing
// whitespace removed.
func getValue(data xml.CharData) string {
	trimmed := bytes.TrimSpace(data)
	return string(trimmed)
}

operator/builtin/parser/xml/xml.go

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
package xml
2+
3+
import (
4+
"context"
5+
"encoding/xml"
6+
"fmt"
7+
"io"
8+
"strings"
9+
10+
"github.com/observiq/stanza/entry"
11+
"github.com/observiq/stanza/operator"
12+
"github.com/observiq/stanza/operator/helper"
13+
)
14+
15+
func init() {
16+
operator.Register("xml_parser", func() operator.Builder { return NewXMLParserConfig("") })
17+
}
18+
19+
// NewXMLParserConfig creates a new XML parser config with default values
20+
func NewXMLParserConfig(operatorID string) *XMLParserConfig {
21+
return &XMLParserConfig{
22+
ParserConfig: helper.NewParserConfig(operatorID, "xml_parser"),
23+
}
24+
}
25+
26+
// XMLParserConfig is the configuration of an XML parser operator.
27+
type XMLParserConfig struct {
28+
helper.ParserConfig `yaml:",inline"`
29+
}
30+
31+
// Build will build an XML parser operator.
32+
func (c XMLParserConfig) Build(context operator.BuildContext) ([]operator.Operator, error) {
33+
parserOperator, err := c.ParserConfig.Build(context)
34+
if err != nil {
35+
return nil, err
36+
}
37+
38+
xmlParser := &XMLParser{
39+
ParserOperator: parserOperator,
40+
}
41+
42+
return []operator.Operator{xmlParser}, nil
43+
}
44+
45+
// XMLParser is an operator that parses XML.
46+
type XMLParser struct {
47+
helper.ParserOperator
48+
}
49+
50+
// Process will parse an entry for XML.
51+
func (x *XMLParser) Process(ctx context.Context, entry *entry.Entry) error {
52+
return x.ParserOperator.ProcessWith(ctx, entry, parse)
53+
}
54+
55+
// parse will parse an xml value
56+
func parse(value interface{}) (interface{}, error) {
57+
strValue, ok := value.(string)
58+
if !ok {
59+
return nil, fmt.Errorf("value passed to parser is not a string")
60+
}
61+
62+
reader := strings.NewReader(strValue)
63+
decoder := xml.NewDecoder(reader)
64+
token, err := decoder.Token()
65+
if err != nil {
66+
return nil, fmt.Errorf("failed to decode as xml: %w", err)
67+
}
68+
69+
elements := []*Element{}
70+
var parent *Element
71+
var current *Element
72+
73+
for token != nil {
74+
switch token := token.(type) {
75+
case xml.StartElement:
76+
parent = current
77+
current = newElement(token)
78+
current.Parent = parent
79+
80+
if parent != nil {
81+
parent.Children = append(parent.Children, current)
82+
} else {
83+
elements = append(elements, current)
84+
}
85+
case xml.EndElement:
86+
current = parent
87+
if parent != nil {
88+
parent = parent.Parent
89+
}
90+
case xml.CharData:
91+
if current != nil {
92+
current.Content = getValue(token)
93+
}
94+
}
95+
96+
token, err = decoder.Token()
97+
if err != nil && err != io.EOF {
98+
return nil, fmt.Errorf("failed to get next xml token: %w", err)
99+
}
100+
}
101+
102+
switch len(elements) {
103+
case 0:
104+
return nil, fmt.Errorf("no xml elements found")
105+
case 1:
106+
return convertToMap(elements[0]), nil
107+
default:
108+
return convertToMaps(elements), nil
109+
}
110+
}

0 commit comments

Comments
 (0)