Test and Validate Transforms

Using the validator API Endpoint

The validator API endpoint allows you to test your transform with sample data. It will interpret both the transform and data and output how Hydrolix will parse and enrich it.

Example:

curl -s \
     -H 'content-type: application/json' \
     -H 'x-hdx-table: sample_project.sample_table' \
     -H 'x-hdx-test: true' \
     https://$host/validator \
     --data-binary '
{
  "transform": {
    "settings": {
      "sql_transform": "SELECT primary, -some_int32 AS `some_int32`, COALESCE(some_other_int32, 789) AS `some_other_int32` FROM {STREAM}",
      "output_columns": [
        {
          "name": "primary",
          "datatype": {
            "type": "datetime",
            "format": "02/01/2006 15:04:05",
            "primary": true
          }
        },
        {
          "name": "some_int32",
          "datatype": {
            "type": "int32"
          }
        },
        {
          "name": "some_other_int32",
          "datatype": {
            "type": "int32"
          }
        }
      ]
    }
  },
  "data": [
    {"primary": "14/06/2022 13:06:47", "some_int32": 99, "some_other_int32": 1234},
    {"primary": "14/06/2022 13:06:48", "some_int32": 99},
    {"primary": "14/06/2022 13:06:49", "some_int32": 123},
    {"primary": "14/06/2022 13:06:50", "some_int32": 456}
  ]
}

Response:

HTTP/1.1 200 OK
Content-Type: application/json; charset=utf-8
Date: Mon, 18 Jul 2022 08:49:37 GMT
Connection: close
Transfer-Encoding: chunked

{
  "code": 200,
  "message": "success",
  "project": {
    "name": "",
    "uuid": ""
  },
  "table": {
    "name": "",
    "uuid": ""
  },
  "transform": {
    "name": "",
    "uuid": ""
  },
  "parsed": [
    [
      {
        "name": "primary",
        "datatype": "datetime",
        "value": "2022-06-14 13:06:47"
      },
      {
        "name": "some_int32",
        "datatype": "int32",
        "value": 99
      },
      {
        "name": "some_other_int32",
        "datatype": "int32",
        "value": 1234
      }
    ],
    [
      {
        "name": "primary",
        "datatype": "datetime",
        "value": "2022-06-14 13:06:48"
      },
      {
        "name": "some_int32",
        "datatype": "int32",
        "value": 99
      },
      {
        "name": "some_other_int32",
        "datatype": "int32",
        "value": null
      }
    ],
    [
      {
        "name": "primary",
        "datatype": "datetime",
        "value": "2022-06-14 13:06:49"
      },
      {
        "name": "some_int32",
        "datatype": "int32",
        "value": 123
      },
      {
        "name": "some_other_int32",
        "datatype": "int32",
        "value": null
      }
    ],
    [
      {
        "name": "primary",
        "datatype": "datetime",
        "value": "2022-06-14 13:06:50"
      },
      {
        "name": "some_int32",
        "datatype": "int32",
        "value": 456
      },
      {
        "name": "some_other_int32",
        "datatype": "int32",
        "value": null
      }
    ]
  ]
    "partition": "",
    "bytes": 0,
    "rows": 0,
    "columns": 0,
    "min_timestamp": 0,
    "max_timestamp": 0,
    "mem_size": 0,
    "shard_key_hash": "",
    "test_response": {
      "data": [
        [
          "2022-06-14 13:06:47",
          -99,
          1234
        ],
        [
          "2022-06-14 13:06:48",
          -99,
          789
        ],
        [
          "2022-06-14 13:06:49",
          -123,
          789
        ],
        [
          "2022-06-14 13:06:50",
          -456,
          789
        ]
      ],
      "meta": [
        {
          "name": "primary",
          "type": "DateTime"
        },
        {
          "name": "some_int32",
          "type": "Nullable(Int32)"
        },
        {
          "name": "some_other_int32",
          "type": "Nullable(Int32)"
        }
      ],
      "rows": 4,
      "statistics": {
        "bytes_read": 56,
        "elapsed": 0.000934846,
        "rows_read": 4
      }
    },
    "indexing_sql": "SELECT `primary`::DateTime as `primary`,`some_int32`::Nullable(Int32) as `some_int32`,`some_other_int32`::Nullable(Int32) as `some_other_int32` FROM (SELECT `primary`::DateTime as `primary`,`some_int32`::Nullable(Int32) as `some_int32`,`some_other_int32`::Nullable(Int32) as `some_other_int32` FROM (SELECT primary, -some_int32 AS `some_int32`, COALESCE(some_other_int32, 789) AS `some_other_int32` FROM file('e81c5d8c-4804-4dd0-812d-e331e3bad1c0.json', 'JSONCompactEachRow', '`primary` DateTime,`some_int32` Nullable(Int32),`some_other_int32` Nullable(Int32)'))) FORMAT JSONCompact SETTINGS hdx_format_work_dir='', hdx_format_sort_keys='', hdx_format_json_schema='[{\"name\":\"primary\",\"datatype\":{\"type\":\"datetime\",\"primary\":true,\"format\":\"%F %T\",\"resolution\":\"s\"}},{\"name\":\"some_int32\",\"datatype\":{\"type\":\"int32\"}},{\"name\":\"some_other_int32\",\"datatype\":{\"type\":\"int32\"}}]', hdx_format_metadata='{\"project\":{\"id\":\"bb8c48f4-7ac2-4de7-99c7-a3cbc195b265\",\"name\":\"sample_project\",\"revision\":5},\"table\":{\"id\":\"c21408ad-7b48-49e8-88c7-bdd8e0f716f1\",\"name\":\"sample_table\",\"revision\":4},\"transforms\":[{\"id\":\"\",\"name\":\"\"}]}', max_ast_elements=100000, max_expanded_ast_elements=100000"
  },
  "intermediate_data": {
    "/var/lib/indexer/e81c5d8c-4804-4dd0-812d-e331e3bad1c0.json": "WyIyMDIyLTA2LTE0IDEzOjA2OjQ3Iiw5OSwxMjM0XQpbIjIwMjItMDYtMTQgMTM6MDY6NDgiLDk5LG51bGxdClsiMjAyMi0wNi0xNCAxMzowNjo0OSIsMTIzLG51bGxdClsiMjAyMi0wNi0xNCAxMzowNjo1MCIsNDU2LG51bGxdCg=="
  }
}

The intermediate_data is the base64 JSON-encoded interpretation of the data, and the indexing SQL is what is used as SQL statement to generate Hydrolix partitions based the data.

If there are any errors from any component, they'll be included in the response. For example, if the indexer is not able to parse the timestamp or the format, it will output an error.

Delimiters in CSV format

When sending in a transform in CSV format, make sure escaped characters are specified as an ASCII character code rather than the character. For example, to specify a tab delimiter, use the ASCII equivalent. Rather than:

"format_details": {
  "delimiter": "\t",
  ...
}

Use the ASCII equivalent:

"format_details": {
  "delimiter": 9,
  ...
}

This applies to the sample transform formats found in Hydrolix's sample repository. While the CSV formats contained there will work with the Transforms API, some of them must be modified to work with this endpoint.

Using the catchall Feature

The catchall feature allows you to see the shape of your data before you've applied a transform to it. For example, below is a transform with a mapping catchall, followed by three lines of sample data:

POST https://$host/validator
Content-Type: application/json
x-hdx-table: sample_project.sample_table
x-hdx-test: true

{
    "transform":
    {
    "name": "demo",
    "settings": {
        "is_default": true,
        "output_columns": [
            {
                "name": "date",
                "datatype": {
                    "type": "datetime",
                    "format": "2006-01-01",
                    "resolution": "seconds",
                    "primary": true
                }
            },
            {
                "name": "catchall",
                "datatype": {
                    "type": "map",
                    "elements":
                    [
                        {
                            "type": "string"
                        },
                        {
                            "type": "string"
                        }
                    ],
                    "catch_all": true
                }
            }
        ],
        "compression": "none",
        "format_details": {
            "flattening": {
                "active": true,
                "depth": 1,
                "map_flattening_strategy": {
                    "left": "."
                }
            }
        }
    },
    "type": "json"
  },
  "data": [ {"timestamp": "2022-06-13 06:03:05.579 +00:00", "component": "query_executor", "level":"info", "user":"anonymous", "duration": 552, "query": "'select version()'", "message": "comment='' admin_comment=''"},
{"timestamp": "2022-06-13 06:03:06.078 +00:00", "component": "query_executor", "level":"info", "user":"anonymous", "duration": 552, "query": "'select version()'", "message": "comment='' admin_comment=''"},
{"timestamp": "2022-06-13 06:03:06.086 +00:00", "component": "query_executor", "level":"info", "message": "user=anonymous duration_ms=0 comment='' admin_comment='' query='select version()'"}]
}

Here any JSON key value in the sample data will be interpreted as a map and put in the catchall column:

"parsed": [
    [
      {
        "name": "date",
        "datatype": "datetime",
        "value": null
      },
      {
        "name": "catchall",
        "datatype": "map",
        "value": {
          "component": "query_executor",
          "duration": "552",
          "level": "info",
          "message": "comment='' admin_comment=''",
          "query": "'select version()'",
          "timestamp": "2022-06-13 06:03:05.579 +00:00",
          "user": "anonymous"
        }
      }
    ]