{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://get.aiware.com/schemas/v1.2/master.json",
  "title": "vtn-standard",
  "description": "Standard engine output at Veritone",
  "type": "object",
  "definitions": {
    "PREAMBLE": {
      "description": "Metadata for this document: creator, creation time, schemas, etc",
      "type": "object",
      "properties": {
        "schemaId": {
          "$ref": "#/definitions/schemaId"
        },
        "sourceEngineId": {
          "description": "ID of the engine that generated this file (provided by Veritone)",
          "$ref": "#/definitions/guid"
        },
        "sourceEngineName": {
          "description": "Name of the engine that generated this file (provided by Veritone)",
          "type": "string",
          "examples": [
            "capio",
            "Transcription - Medical V3"
          ]
        },
        "taskPayload": {
          "$ref": "#/definitions/taskPayload"
        },
        "taskId": {
          "description": "ID of the task that generated this file (provided by Veritone)",
          "$ref": "#/definitions/guid"
        },
        "generatedDateUTC": {
          "description": "DateTime (ISO8601) of when the engine result was generated (provided by Veritone if not included)",
          "$ref": "#/definitions/dateTime"
        },
        "externalSourceId": {
          "description": "Vendor specific referenceId to this engine result",
          "type": "string"
        },
        "language": {
          "$ref": "#/definitions/language"
        },
        "validationContracts": {
          "$ref": "#/definitions/validationContracts"
        }
      }
    },
    "FILEDATA": {
      "description": "Data in this section applies to the file being analyzed as a whole. This used for expressing summary data that spans the entirety of a media file.",
      "type": "object",
      "properties": {
        "tags": {
          "$ref": "#/definitions/tags"
        },
        "collateIndex": {
          "$ref": "#/definitions/collateIndex"
        },
        "language": {
          "$ref": "#/definitions/language"
        },
        "summary": {
          "$ref": "#/definitions/summary"
        },
        "sentiment": {
          "$ref": "#/definitions/sentiment"
        },
        "gps": {
          "$ref": "#/definitions/gpsCoordinates"
        },
        "emotions": {
          "$ref": "#/definitions/emotions"
        },
        "vendor": {
          "$ref": "#/definitions/vendor"
        }
      }
    },
    "OBJECTDATA": {
      "description": "Data in this section applies to parts of the media that do not include a time component. For example, objects in an image; sentiment of text paragraphs, or translations of an entire document.",
      "type": "array",
      "items": {
        "$ref": "#/definitions/objectResult"
      }
    },
    "TIMESERIESDATA": {
      "description": "Data in this section applies to specific time ranges within the file. Stores insights from audio and video media referenced by timeslices identified from the start of the media",
      "type": "array",
      "items": {
        "$ref": "#/definitions/seriesItem"
      }
    },
    "schemaId": {
      "description": "vtn-standard.master.schema.json or subschema.schema.json (provided by Veritone)",
      "type": "string",
      "examples": [
        "https://docs.veritone.com/schemas/vtn-standard/aion/aion.json"
      ]
    },
    "guid": {
      "description": "A globally unique ID. Typically a UUID in the 8-4-4-4-12 character format, but it doesn't have to be a UUID",
      "type": "string",
      "examples": [
        "fe261482-af8d-4709-aefb-ff1c19b61eb9",
        "21062204_Rl54mU5vIVED8Y9"
      ]
    },
    "dateTime": {
      "description": "DateTime (ISO8601)",
      "type": "string",
      "format": "date-time",
      "examples": [
        "2017-12-08T17:19:02Z"
      ]
    },
    "validationContracts": {
      "description": "Defines which validation contract(s) should be used to validate this content. May be one of the standard categories or the schemaId of a Structured Data Object",
      "type": "array",
      "minItems": 1,
      "items": [
        {
          "type": "string",
          "anyOf": [
            {
              "enum": [
                "concept",
                "entity",
                "keyword",
                "language",
                "media-translated",
                "object",
                "sentiment",
                "summary",
                "text",
                "transcript",
                "anomaly"
              ]
            },
            {
              "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"
            }
          ]
        }
      ]
    },
    "validated": {
      "description": "When included, requires that the document declare it's validation contracts",
      "required": [
        "validationContracts"
      ]
    },
    "taskPayload": {
      "description": "JSON key/value task payload describing the tasks used to start the engine run (provided by Veritone). Task payload will be a combination of attributes found on https://docs.veritone.com/#/developer/adapters/quick-start/step-2?id=constructing-a-batch-pull-adapter and any default custom fields (https://docs.veritone.com/#/developer/engines/custom-fields/?id=custom-fields)",
      "type": "object",
      "examples": [
        {
          "detectionThreshold": 0.7,
          "processingFPS": 30
        }
      ]
    },
    "tags": {
      "description": "Arbitrary tags associated with this file, object, or series. Typically key-value strings with an optional score. Boolean tags can be represented by only including a keyword, and the value is assumed to be true",
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "key": {
            "type": "string"
          },
          "value": {
            "type": "string",
            "default": "true"
          },
          "score": {
            "type": "number",
            "minimum": 0,
            "maximum": 1
          }
        },
        "required": [
          "key"
        ],
        "examples": [
          {
            "key": "foo",
            "value": "bar",
            "score": 0.13
          },
          {
            "key": "gender",
            "value": "female"
          },
          {
            "key": "groundTruth"
          },
          {
            "key": "moderation",
            "value": "nudity"
          },
          {
            "key": "moderation",
            "value": "fakeNews",
            "score": 0.84
          }
        ]
      }
    },
    "collateIndex": {
      "description": "Collation order used to preserve ordering, especially when parallel processing. Items with lower collate values come before items with higher collate values. Indexes do not have to be sequential, but may not be duplicated. You may use whatever integers are convenient: indexes, frame/page numbers, byte offsets, etc.",
      "type": "integer",
      "examples": [
        0,
        1,
        2,
        18673,
        -22
      ]
    },
    "confidence": {
      "description": "Confidence expressed as a range of 0.0 (not confident) to 1.0 (maximum confidence)",
      "type": "number",
      "minimum": 0,
      "maximum": 1,
      "examples": [
        0.0,
        1.0,
        0.63,
        0.122227
      ]
    },
    "point": {
      "description": "A point describing 2 dimensional coordinates in the media.",
      "type": "object",
      "properties": {
        "x": {
          "description": "A horizontal coordinate as a fraction of the x axis: 0.0 (left edge) to 1.0 (right edge)",
          "type": "number",
          "minimum": 0,
          "maximum": 1,
          "examples": [
            0.0,
            0.4,
            1.0
          ]
        },
        "y": {
          "description": "A vertical coordinate as a fraction of the y axis:  0.0 (top edge) to 1.0 (bottom edge)",
          "type": "number",
          "minimum": 0,
          "maximum": 1,
          "examples": [
            0.0,
            0.4,
            1.0
          ]
        }
      },
      "required": [
        "x",
        "y"
      ],
      "additionalProperties": false
    },
    "boundingPoly": {
      "description": "Defines a polygon describing an element in the source. List points clockwise (counterclockwise is a hole), with an implicit line from last to first point. There is a special case of 1 point for a identification by center point, but otherwise a minimum of 3 points must be provided.",
      "type": "array",
      "items": {
        "$ref": "#/definitions/point"
      },
      "oneOf": [
        {
          "minItems": 1,
          "maxItems": 1
        },
        {
          "minItems": 3
        }
      ]
    },
    "objectCategory": {
      "description": "Optional categories for object recognition. Used for referencing external taxonomies.",
      "type": "array",
      "minItems": 1,
      "items": {
        "type": "object",
        "properties": {
          "class": {
            "type": "string",
            "description": "The class of the object recognized.",
            "examples": [
              "animal",
              "mammal",
              "jerseyNumber"
            ]
          },
          "@id": {
            "type": "string",
            "description": "Private id variable used by the engine to refer to its internal dataset",
            "examples": [
              "kg:/m/0dl567"
            ]
          },
          "confidence": {
            "$ref": "#/definitions/confidence"
          }
        },
        "required": [
          "class"
        ],
        "additionalProperties": false
      }
    },
    "objectResult": {
      "description": "Describes a visual or textural object within the media",
      "type": "object",
      "properties": {
        "type": {
          "$ref": "#/definitions/objectResultType"
        },
        "label": {
          "description": "Main label of this object. Required if the object contains no other identifying information.",
          "type": "string"
        },
        "uri": {
          "description": "URI of the thumbnail to show in the UI. If this is not present, the bounding poly can be used to generate one dynamically.",
          "type": "string"
        },
        "confidence": {
          "$ref": "#/definitions/confidence"
        },
        "text": {
          "$ref": "#/definitions/objectText"
        },
        "language": {
          "$ref": "#/definitions/language"
        },
        "collateIndex": {
          "$ref": "#/definitions/collateIndex"
        },
        "page": {
          "$ref": "#/definitions/page"
        },
        "paragraph": {
          "$ref": "#/definitions/paragraph"
        },
        "sentence": {
          "$ref": "#/definitions/sentence"
        },
        "transcription": {
          "$ref": "#/definitions/transcription"
        },
        "lipVoiceCorrelation": {
          "$ref": "#/definitions/lipVoiceCorrelation"
        },
        "lipMovement": {
          "$ref": "#/definitions/lipMovement"
        },
        "faceLandmarks": {
          "$ref": "#/definitions/faceLandmarks"
        },
        "sentiment": {
          "$ref": "#/definitions/sentiment"
        },
        "emotions": {
          "$ref": "#/definitions/emotions"
        },
        "age": {
          "$ref": "#/definitions/age"
        },
        "objectCategory": {
          "$ref": "#/definitions/objectCategory"
        },
        "boundingPoly": {
          "$ref": "#/definitions/boundingPoly"
        },
        "gps": {
          "$ref": "#/definitions/gpsCoordinates"
        },
        "structuredData": {
          "$ref": "#/definitions/structuredData"
        },
        "vendor": {
          "$ref": "#/definitions/vendor"
        },
        "libraryId": {
          "description": "GUID of the training library used to extract/discover this object",
          "$ref": "#/definitions/guid"
        },
        "entityId": {
          "description": "GUID of the entity in the library that was used to identify this object",
          "$ref": "#/definitions/guid"
        },
        "mode": {
          "type": "string",
          "enum": [
            "enroll",
            "verify"
          ]
        },
        "region": {
          "description": "Region of source a match was found",
          "type": "string",
          "enum": [
            "left",
            "right",
            "top",
            "bottom"
          ]
        },
        "entityIdentifierId": {
          "description": "The unique ID of recognized entity identifier (for example, a specific image).",
          "type": "string"
        }
      },
      "required": [
        "type"
      ],
      "additionalProperties": false
    },
    "objectResultType": {
      "description": "The type of a visual or textural object in the media.",
      "type": "string",
      "enum": [
        "object",
        "face",
        "facial-features",
        "face-verification",
        "speaker-verification",
        "licensePlate",
        "logo",
        "fingerprint",
        "speaker",
        "sound",
        "concept",
        "keyword",
        "text",
        "namedEntity",
        "barcode",
        "anomaly"
      ]
    },
    "objectText": {
      "description": "The extracted textual content of a found object.",
      "type": "string"
    },
    "summary": {
      "description": "Summary of the document.",
      "type": "string"
    },
    "word": {
      "description": "A spoken word in a transcript",
      "type": "string"
    },
    "words": {
      "description": "Transcript: Words possibly spoken in this time slice. If using a lattice: there should be only one 'word' per item, 'bestPath' is required, slices can overlap in time, and other fields are optional. If using alternates: there may be multiple 'word's per item, confidence is required, slices must not overlap in time, and other fields are optional",
      "allOf": [
        {
          "type": "array",
          "minItems": 1,
          "items": {
            "type": "object",
            "properties": {
              "word": {
                "$ref": "#/definitions/word"
              },
              "confidence": {
                "description": "Confidence that this item contains the spoken word. Required if listing alternate words",
                "$ref": "#/definitions/confidence"
              },
              "utteranceLength": {
                "description": "Number of sequential words in this translation element. Required if alternates are listed (for historical reasons)",
                "$ref": "#/definitions/utteranceLength"
              },
              "bestPath": {
                "description": "This word is on the best translation path through a lattice. Required if using a lattice such that the best translation is the one created by discarding everything not on the best path.",
                "type": "boolean"
              }
            },
            "required": [
              "word"
            ],
            "additionalProperties": false
          }
        },
        {
          "type": "array",
          "if": {
            "minItems": 2
          },
          "then": {
            "items": {
              "required": [
                "confidence",
                "utteranceLength"
              ]
            },
            "uniqueItems": true,
            "requireBestPath": true
          }
        }
      ]
    },
    "utteranceLength": {
      "description": "Number of consecutive time-slices the utterance spans. example: 'of'->[('thrones' utteranceLength:2)('their' utterenceLength:1)]->('own' utteranceLength:1)",
      "type": "integer",
      "minimum": 1,
      "default": 1,
      "examples": [
        1,
        2
      ]
    },
    "language": {
      "description": "Language Identification. Format: BCP-47 https://tools.ietf.org/rfc/bcp/bcp47.txt.",
      "type": "string",
      "examples": [
        "en",
        "en-US",
        "fr",
        "ko"
      ]
    },
    "speakerId": {
      "description": "Identification of a speaker. Could be the source of the speaker (e.g. which channel contained the speech) or the identity of a speaker",
      "type": "string",
      "examples": [
        "channel0",
        "speaker1",
        "M1",
        "<libraryId>:<entityId>"
      ]
    },
    "seriesItem": {
      "description": "A time-slice within audio or video media and it's associated data.",
      "type": "object",
      "properties": {
        "startTimeMs": {
          "description": "The start time relative to the start of the media asset in milliseconds.",
          "type": "integer",
          "minimum": 0,
          "examples": [
            0
          ]
        },
        "stopTimeMs": {
          "description": "The stop time relative to the start of the media asset in milliseconds.",
          "type": "integer",
          "minimum": 0,
          "examples": [
            3000
          ]
        },
        "tags": {
          "$ref": "#/definitions/tags"
        },
        "summary": {
          "$ref": "#/definitions/summary"
        },
        "speakerId": {
          "$ref": "#/definitions/speakerId"
        },
        "words": {
          "$ref": "#/definitions/words"
        },
        "language": {
          "$ref": "#/definitions/language"
        },
        "sentiment": {
          "$ref": "#/definitions/sentiment"
        },
        "emotions": {
          "$ref": "#/definitions/emotions"
        },
        "object": {
          "$ref": "#/definitions/objectResult"
        },
        "gps": {
          "$ref": "#/definitions/gpsCoordinates"
        },
        "structuredData": {
          "$ref": "#/definitions/structuredData"
        },
        "vendor": {
          "$ref": "#/definitions/vendor"
        },
        "media": {
          "$ref": "#/definitions/media"
        },
        "libraryId": {
          "description": "GUID of the training library used to extract/discover this object",
          "$ref": "#/definitions/guid"
        },
        "entityId": {
          "description": "GUID of the entity in the library that was used to identify this object",
          "$ref": "#/definitions/guid"
        }
      },
      "required": [
        "startTimeMs",
        "stopTimeMs"
      ],
      "additionalProperties": false
    },
    "medias": {
      "description": "Reference to files when a cognition process outputs files.",
      "type": "array",
      "items": {
        "$ref": "#/definitions/media"
      }
    },
    "media": {
      "description": "A reference to a file output by cognition",
      "type": "object",
      "properties": {
        "assetId": {
          "description": "The ID of the asset output by the engine",
          "type": "string"
        },
        "contentType": {
          "description": "An IANA Media Type: https://www.iana.org/assignments/media-types/media-types.xhtml",
          "type": "string",
          "examples": [
            "image/jpeg",
            "application/json",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
          ]
        },
        "language": {
          "$ref": "#/definitions/language"
        }
      },
      "required": [
        "assetId"
      ]
    },
    "page": {
      "description": "The page on which the text or insight appears",
      "type": "integer",
      "minimum": 1
    },
    "paragraph": {
      "description": "The paragraph in which the text or insight appears. If page is provided, it must be the paragraph number on the page (paragraph 1 is the first paragraph on the page, even if it is a partial paragraph due to a page-break), otherwise it is the paragraph number for the whole document. Paragraphs are delimited by LF (*nix), CR (mac), or CRLF (DOS).",
      "type": "integer",
      "minimum": 1
    },
    "sentence": {
      "description": "The sentence in which the text or insight appears. If paragraph is provided, it must be the sentence number within the paragraph (sentence 1 is the first sentence in the paragraph, even if it is a partial sentence due to a page-break), otherwise it is the sentence number for the whole document. Sentences are delimited by periods (.), question marks (?) or exclamation points (!) followed by whitespace.",
      "type": "integer",
      "minimum": 1
    },
    "noDocumentIndexing": {
      "description": "Disable document indexing (page/paragraph/sentence) for the object",
      "properties": {
        "page": false,
        "paragraph": false,
        "sentence": false
      }
    },
    "transcription": {
      "description": "Auxiliary output used for speaker verification engines. Provides a confidence score between the transcribed audio and a specified phrase.",
      "type": "object",
      "properties": {
        "text": {
          "type": "string"
        },
        "confidence": {
          "$ref": "#/definitions/confidence"
        }
      },
      "required": [
        "confidence"
      ]
    },
    "sentiment": {
      "description": "General sentiment, either positive (I like you) or negative (I hate you) or a mixture (I hate that I like you).",
      "type": "object",
      "properties": {
        "positiveValue": {
          "description": "Positive sentiment rating from 0.0 (neutral) to 1.0 (extremely positive)",
          "type": "number",
          "minimum": 0,
          "maximum": 1,
          "default": 0
        },
        "positiveConfidence": {
          "description": "Confidence of positive sentiment rating",
          "$ref": "#/definitions/confidence"
        },
        "negativeValue": {
          "description": "Negative sentiment rating from 0.0 (neutral) to 1.0 (extremely negative)",
          "type": "number",
          "minimum": 0,
          "maximum": 1,
          "default": 0
        },
        "negativeConfidence": {
          "description": "Confidence of negative sentiment rating",
          "$ref": "#/definitions/confidence"
        }
      },
      "anyOf": [
        {
          "required": [
            "positiveValue"
          ]
        },
        {
          "required": [
            "negativeValue"
          ]
        }
      ],
      "additionalProperties": false,
      "dependencies": {
        "positiveConfidence": {
          "required": [
            "positiveValue"
          ]
        },
        "negativeConfidence": {
          "required": [
            "negativeValue"
          ]
        }
      }
    },
    "gpsCoordinates": {
      "description": "Global position, direction and motion",
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "latitude": {
            "description": "Latitude in decimal degrees. Positive is north, negative is south",
            "type": "number",
            "minimum": -90.0,
            "maximum": 90.0,
            "examples": [
              38.8951,
              -22.9068
            ]
          },
          "longitude": {
            "description": "Longitude in decimal degrees. East coordinates are positve, and west coordinates can be either negative (preferred) or >180 (e.g. -45=315)",
            "type": "number",
            "minimum": -180.0,
            "maximum": 360.0,
            "examples": [
              78.0421,
              -104.9903,
              213.123
            ]
          },
          "precision": {
            "description": "Precision of coordinate in meters",
            "type": "number",
            "examples": [
              100
            ]
          },
          "direction": {
            "description": "Azimuth the point of view is facing (if no velocity) or traveling (if velocity). 0/360 = north, 90 = east, 180 = south, 270 = west",
            "type": "number",
            "minimum": 0,
            "maximum": 360,
            "examples": [
              "10.1"
            ]
          },
          "velocity": {
            "description": "Speed the point of view is traveling in meters/second",
            "type": "number",
            "examples": [
              14,
              634.6
            ]
          },
          "altitude": {
            "description": "Altitude above sea level in meters",
            "type": "number",
            "examples": [
              11000,
              -27
            ]
          }
        },
        "required": [
          "longitude",
          "latitude"
        ],
        "additionalProperties": false
      }
    },
    "emotions": {
      "description": "Emotions represented in the media. May be used with sentiment to fine-tune, e.g. is a negative sentiment anger or sadness?",
      "type": "array",
      "items": {
        "properties": {
          "emotion": {
            "type": "string",
            "examples": [
              "happy",
              "angry",
              "surprised"
            ]
          },
          "emotionValue": {
            "description": "Strength of the emotion from 0.0 (none) to 1.0 (overwhelming)",
            "type": "number",
            "minimum": 0,
            "maximum": 1
          },
          "emotionConfidence": {
            "description": "Confidence this emotion is correct",
            "$ref": "#/definitions/confidence"
          }
        },
        "required": [
          "emotion"
        ]
      }
    },
    "age": {
      "description": "Age in years",
      "type": "object",
      "properties": {
        "min": {
          "type": "number"
        },
        "max": {
          "type": "number"
        },
        "confidence": {
          "$ref": "#/definitions/confidence"
        }
      }
    },
    "lipVoiceCorrelation": {
      "description": "Used for the facial-features engine to describe how confident it is that the lips are moving in concert with current speech",
      "type": "object",
      "properties": {
        "confidence": {
          "$ref": "#/definitions/confidence"
        }
      }
    },
    "lipMovement": {
      "description": "Used for the facial-features engine to describe how confident it is that the lips are moving to produce speech",
      "type": "object",
      "properties": {
        "confidence": {
          "$ref": "#/definitions/confidence"
        }
      }
    },
    "faceLandmarks": {
      "description": "For facial detection, used to identify the location of various facial features",
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "type": {
            "description": "Which facial feature",
            "type": "string",
            "examples": [
              "mouth",
              "left-eye",
              "chin",
              "glasses"
            ]
          },
          "locationPoly": {
            "$ref": "#/definitions/boundingPoly"
          }
        },
        "required": [
          "type",
          "locationPoly"
        ]
      }
    },
    "structuredData": {
      "description": "Structured Data Objects (SDO) that conforms to a Structured Data Schema (SDS) defined in the Veritone system. The properties of this object must be the UUID of a SDS, with a value that is an SDO that matches the schema. Usually this just consists of a series of name/value pairs.",
      "type": "object",
      "patternProperties": {
        "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$": {
          "description": "This must be data that matches the schema defined by the Veritone Structured Data Schema with the ID of the property name"
        }
      },
      "additionalProperties": false
    },
    "vendor": {
      "description": "Custom data that doesn't conform to any other field. You can add any arbitrary data inside this object, but it will not be indexed, searchable, or have any impact on the system. However it will be returned when reading the data back out.",
      "type": "object"
    }
  }
}
