epiphany/node_modules/bcp-47/lib/parse.js
2023-12-09 22:19:03 -08:00

263 lines
6.7 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

'use strict'
// Predicates used by `parse` on the character codes returned by `charCodeAt`.
var alphanumeric = require('is-alphanumerical')
var alphabetical = require('is-alphabetical')
var decimal = require('is-decimal')
// Searched with `indexOf` in `parse`: special-cased tags found in it are
// reported under `regular`, all other special-cased tags under `irregular`.
var regular = require('./regular.json')
// Keyed by lowercased special-case tags; a truthy value is the replacement
// tag that `parse` parses instead when normalizing.
var normal = require('./normalize.json')
// The module's sole export is the `parse` function.
module.exports = parse
// Borrowed so own-key lookups can be done as `own.call(object, key)`.
var own = {}.hasOwnProperty
/**
 * Parse a BCP 47 language tag.
 *
 * The tag is scanned left to right: language, extended language subtags,
 * script, region, variants, extensions, and finally private use.
 * Special-cased tags listed in `normal` are handled before any scanning.
 * Matching is done on a lowercased copy; returned subtags are sliced from
 * the original input and so keep their casing.
 *
 * @param {string} tag
 *   Tag to parse (anything other than `null`/`undefined` is stringified).
 * @param {Object} [options]
 * @param {boolean} [options.normalize=true]
 *   When not `false`-y, a special-cased tag that has a replacement in
 *   `normal` is replaced by it and the replacement is parsed instead.
 * @param {boolean} [options.forgiving=false]
 *   When truthy, a failure returns whatever was parsed so far instead of an
 *   empty result.
 * @param {Function} [options.warning]
 *   Called as `warning(reason, code, offset)` when a failure is found.
 * @returns {Object}
 *   Result in the shape produced by `empty()`.
 * @throws {Error}
 *   When `tag` is `null` or `undefined`.
 */
/* eslint-disable-next-line complexity */
function parse(tag, options) {
  var settings = options || {}
  var result = empty()
  var source = String(tag)
  var value = source.toLowerCase()
  var index = 0
  var start
  var groups
  var offset

  // Check input.
  if (tag == null) {
    throw new Error('Expected string, got `' + tag + '`')
  }

  // Let's start.
  // First: the edge cases.
  if (own.call(normal, value)) {
    if ((settings.normalize == null || settings.normalize) && normal[value]) {
      return parse(normal[value])
    }

    result[regular.indexOf(value) === -1 ? 'irregular' : 'regular'] = source
    return result
  }

  // Now, to actually parse, eat what could be a language.
  while (alphabetical(value.charCodeAt(index)) && index < 9) index++

  // A language.
  if (index > 1 /* Min 639. */ && index < 9 /* Max subtag. */) {
    // 5 and up is a subtag.
    // 4 is the size of reserved languages.
    // 3 an ISO 639-2 or ISO 639-3.
    // 2 is an ISO 639-1.
    // <https://github.com/wooorm/iso-639-2>
    // <https://github.com/wooorm/iso-639-3>
    result.language = source.slice(0, index)

    // In the lookaheads below, the final check makes sure the subtag really
    // ends there.
    // It must reject *any* alphanumeric (not just a character of the same
    // class), otherwise a longer subtag such as the variant `123a` or
    // `latn1` would be cut in two and the remainder reported as superfluous.
    if (index < 4 /* Max 639. */) {
      groups = 0

      while (
        value.charCodeAt(index) === 45 /* `-` */ &&
        alphabetical(value.charCodeAt(index + 1)) &&
        alphabetical(value.charCodeAt(index + 2)) &&
        alphabetical(value.charCodeAt(index + 3)) &&
        !alphanumeric(value.charCodeAt(index + 4))
      ) {
        if (groups > 2 /* Max extended language subtag count. */) {
          return fail(
            index,
            3,
            'Too many extended language subtags, expected at most 3 subtags'
          )
        }

        // Extended language subtag.
        result.extendedLanguageSubtags.push(source.slice(index + 1, index + 4))
        index += 4
        groups++
      }
    }

    // ISO 15924 script.
    // <https://github.com/wooorm/iso-15924>
    if (
      value.charCodeAt(index) === 45 /* `-` */ &&
      alphabetical(value.charCodeAt(index + 1)) &&
      alphabetical(value.charCodeAt(index + 2)) &&
      alphabetical(value.charCodeAt(index + 3)) &&
      alphabetical(value.charCodeAt(index + 4)) &&
      !alphanumeric(value.charCodeAt(index + 5))
    ) {
      result.script = source.slice(index + 1, index + 5)
      index += 5
    }

    if (value.charCodeAt(index) === 45 /* `-` */) {
      // ISO 3166-1 region.
      // <https://github.com/wooorm/iso-3166>
      if (
        alphabetical(value.charCodeAt(index + 1)) &&
        alphabetical(value.charCodeAt(index + 2)) &&
        !alphanumeric(value.charCodeAt(index + 3))
      ) {
        result.region = source.slice(index + 1, index + 3)
        index += 3
      }
      // UN M49 region.
      // <https://github.com/wooorm/un-m49>
      else if (
        decimal(value.charCodeAt(index + 1)) &&
        decimal(value.charCodeAt(index + 2)) &&
        decimal(value.charCodeAt(index + 3)) &&
        !alphanumeric(value.charCodeAt(index + 4))
      ) {
        result.region = source.slice(index + 1, index + 4)
        index += 4
      }
    }

    // Variants.
    while (value.charCodeAt(index) === 45 /* `-` */) {
      offset = start = index + 1

      while (alphanumeric(value.charCodeAt(offset))) {
        if (offset - start > 7 /* Max variant. */) {
          return fail(
            offset,
            1,
            'Too long variant, expected at most 8 characters'
          )
        }

        offset++
      }

      if (
        // Long variant.
        offset - start > 4 /* Min alpha numeric variant. */ ||
        // Short variant.
        (offset - start > 3 /* Min variant. */ &&
          decimal(value.charCodeAt(start)))
      ) {
        result.variants.push(source.slice(start, offset))
        index = offset
      }
      // Something else.
      else {
        break
      }
    }

    // Extensions.
    while (value.charCodeAt(index) === 45 /* `-` */) {
      // Exit if this isn't an extension.
      if (
        value.charCodeAt(index + 1) === 120 /* `x` */ ||
        !alphanumeric(value.charCodeAt(index + 1)) ||
        value.charCodeAt(index + 2) !== 45 /* `-` */ ||
        !alphanumeric(value.charCodeAt(index + 3))
      ) {
        break
      }

      // Skip the dash and the singleton; `offset` now sits on the dash
      // before the first extension subtag.
      offset = index + 2
      groups = 0

      while (
        value.charCodeAt(offset) === 45 /* `-` */ &&
        alphanumeric(value.charCodeAt(offset + 1)) &&
        alphanumeric(value.charCodeAt(offset + 2))
      ) {
        start = offset + 1
        offset = start + 2
        groups++

        while (alphanumeric(value.charCodeAt(offset))) {
          if (offset - start > 7 /* Max extension. */) {
            return fail(
              offset,
              2,
              'Too long extension, expected at most 8 characters'
            )
          }

          offset++
        }
      }

      if (!groups) {
        return fail(
          offset,
          4,
          'Empty extension, extensions must have at least 2 characters of content'
        )
      }

      result.extensions.push({
        singleton: source.charAt(index + 1),
        extensions: source.slice(index + 3, offset).split('-')
      })

      index = offset
    }
  }
  // Not a language.
  else {
    index = 0
  }

  // Private use.
  // NOTE(review): a private-use prefix without any subtag (`x`, `en-x`)
  // passes through here without a warning; kept as is for compatibility.
  if (
    (index === 0 && value.charCodeAt(index) === 120) /* `x` */ ||
    (value.charCodeAt(index) === 45 /* `-` */ &&
      value.charCodeAt(index + 1) === 120) /* `x` */
  ) {
    // Move past `x` (at the start) or `-x` (after other subtags).
    offset = index = index ? index + 2 : 1

    while (
      value.charCodeAt(offset) === 45 /* `-` */ &&
      alphanumeric(value.charCodeAt(offset + 1))
    ) {
      offset = start = index + 1

      while (alphanumeric(value.charCodeAt(offset))) {
        if (offset - start > 7 /* Max private use. */) {
          return fail(
            offset,
            5,
            'Too long private-use area, expected at most 8 characters'
          )
        }

        offset++
      }

      result.privateuse.push(source.slice(index + 1, offset))
      index = offset
    }
  }

  // Anything left unconsumed makes the tag invalid.
  if (index !== source.length) {
    return fail(index, 6, 'Found superfluous content after tag')
  }

  return result

  // Report a failure to `settings.warning` (if given) and produce the value
  // to return: the partial result when forgiving, an empty result otherwise.
  function fail(offset, code, reason) {
    if (settings.warning) settings.warning(reason, code, offset)
    return settings.forgiving ? result : empty()
  }
}
// Build a blank result: scalar fields start as `null`, list fields as new
// empty arrays, so no two results ever share state.
function empty() {
  var blank = {}

  blank.language = null
  blank.extendedLanguageSubtags = []
  blank.script = null
  blank.region = null
  blank.variants = []
  blank.extensions = []
  blank.privateuse = []
  blank.irregular = null
  blank.regular = null

  return blank
}