Module:data consistency check
Documentation for this module may be created at Module:data consistency check/doc
local export = {}
local m_language_data = require("Module:languages/alldata")
local m_language_codes = require('Module:languages/code to canonical name')
local m_language_canonical_names = require('Module:languages/canonical names')
local m_etym_language_data = require("Module:etymology languages/data")
local m_family_data = require('Module:families/data')
local m_script_data = require('Module:scripts/data')
local m_table = require("Module:table")
local Array = require("Module:array")
local messages
-- Record a formatted discrepancy message under the given module name.
-- The varargs are a string.format pattern followed by its arguments. If
-- formatting or recording raises an error, the raw arguments are written to
-- the debug log instead of being stored.
local function discrepancy(modname, ...)
	local function record(...)
		messages[modname]:insert(string.format(...))
	end
	local recorded = pcall(record, ...)
	if not recorded then
		mw.log(...)
	end
end
-- Shared bookkeeping filled in while the checks run. These tables are never
-- reset, so their contents accumulate over a single run of the checks.

-- code -> name of the data module in which that code was first seen.
local all_codes = {}
-- canonical language name -> code that first claimed it.
local language_names = {}
-- canonical family name -> code that first claimed it.
local family_names = {}
-- canonical script name -> code that first claimed it.
local script_names = {}
-- Set of family (or parent) codes referenced by a language, etymology
-- language or family.
local nonempty_families = {}
-- Family codes exempt from the "has no child families or languages" report.
local allowed_empty_families = {tbq = true}
-- Set of script codes listed by at least one language.
local nonempty_scripts = {}
-- Build a wikilink to the category of a language, given its canonical name.
-- " language" is appended unless the name already ends in "language" or
-- "Language". A missing name yields "???".
local function link(name)
	if not name then
		return "???"
	end
	local target = name
	if not name:find("[Ll]anguage$") then
		target = name .. " language"
	end
	return "[[:Category:" .. target .. "|" .. target .. "]]"
end
-- Build a wikilink to the category of a script, given its canonical name.
-- Names ending in "code" or "semaphore" link to a category named after the
-- name itself (first lowercase letter capitalised); every other name gets
-- " script" appended. A missing name yields "???".
local function link_script(name)
	if not name then
		return "???"
	end
	local named_after_itself = name:find("[Cc]ode$") or name:find("[Ss]emaphore$")
	if not named_after_itself then
		local label = name .. " script"
		return "[[:Category:" .. label .. "|" .. label .. "]]"
	end
	-- gsub also returns a replacement count; only the string is wanted.
	local category = name:gsub("^%l", string.upper)
	return "[[:Category:" .. category .. "|" .. name .. "]]"
end
-- Report the unrecognised keys found in one data table.
-- `invalid_keys` is an Array of the offending keys; `is_script` selects the
-- script-style link instead of the language-style link for the entry name.
local function invalid_keys_message(modname, code, data, invalid_keys, is_script)
	local several = #invalid_keys ~= 1
	local make_link = is_script and link_script or link
	local key_list = invalid_keys
		:map(function (key)
			return '<code>' .. key .. '</code>'
		end)
		:concat(", ")
	discrepancy(modname, "The data key%s %s for %s (<code>%s</code>) %s invalid.",
		several and "s" or "",
		key_list,
		make_link(data.canonicalName or data[1]),
		code,
		several and "are" or "is")
end
-- Create a checker that reports every key of a data table that is not in
-- `valid_keys`. The returned function takes (modname, code, data).
-- `is_script` is forwarded to the message builder.
local function check_data_keys(valid_keys, is_script)
	local allowed = Array(valid_keys):to_set()

	return function (modname, code, data)
		local rejected
		for key in pairs(data) do
			if not allowed[key] then
				-- Only allocate the list once something is actually wrong.
				if not rejected then
					rejected = Array()
				end
				rejected:insert(key)
			end
		end
		if rejected then
			invalid_keys_message(modname, code, data, rejected, is_script)
		end
	end
end
-- Adapted from isArray in [[Module:table]].
-- Counts the keys of `t` one at a time and returns the first count i for
-- which t[i] is nil; returns nothing when the keys are exactly 1..n.
local function find_gap(t)
	local count = 0
	local key = next(t)
	while key ~= nil do
		count = count + 1
		if t[count] == nil then
			return count
		end
		key = next(t, key)
	end
end
-- Check that the field `array_name` is a table whose keys form a gapless
-- array, and report a discrepancy otherwise. When `subarray_name` is given,
-- the field is looked up in data[subarray_name] (which must exist) instead
-- of in `data` itself.
local function check_array(modname, code, data, array_name, subarray_name)
	local container = data
	if subarray_name then
		container = assert(data[subarray_name], subarray_name)
	end

	-- Text that places the field inside its sub-table, if any.
	local function location()
		if subarray_name then
			return "the " .. subarray_name .. " field in "
		end
		return ""
	end

	local value = container[array_name]
	local value_type = type(value)

	if value_type ~= "table" then
		local description
		if value_type == "nil" then
			description = "nil"
		else
			description = "a " .. value_type
		end
		discrepancy(modname, "The %s field in %sthe data table for %s (<code>%s</code>) should be an array (table) but is %s.",
			array_name,
			location(),
			data.canonicalName or data[1],
			code,
			description)
		return
	end

	local gap = find_gap(value)
	if gap then
		discrepancy(modname, "The %s array in %sthe data table for %s (<code>%s</code>) has a gap at index %d.",
			array_name,
			location(),
			data.canonicalName or data[1],
			code, gap)
	end
end
-- Validate the Wikidata item id stored in data[key], if there is one.
-- A number must be a positive integer and a string must look like "Q123".
-- A missing value, or a value of any other type, is not reported.
local function check_wikidata_item(modname, code, data, key)
	local item = data[key]
	local item_type = type(item)

	if item_type == "number" then
		local is_valid = require("Module:table").isPositiveInteger(item)
		if not is_valid then
			discrepancy(modname, "%g, the Wikidata item id for %s (<code>%s</code>), is not a positive integer or a string in the correct format.",
				item, data.canonicalName or data[1], code)
		end
	elseif item_type == "string" then
		if not item:find("^Q%d+$") then
			discrepancy(modname, "%s, the Wikidata item id for %s (<code>%s</code>), is not a string in the correct format or a positive integer.",
				item, data.canonicalName or data[1], code)
		end
	end
end
-- Check one list of alternative names (data[data_key]): it must be a gapless
-- array, must not repeat the canonical name and must not contain the same
-- name twice. Nested tables of names are only accepted when `allow_nested`
-- is set; their members are checked like top-level names.
local function check_other_names_or_aliases(modname, code, canonical_name, data, data_key, allow_nested)
	local list = data[data_key]
	if not list then
		return
	end

	check_array(modname, code, data, data_key)

	local seen = {}

	local function inspect(name)
		if name == canonical_name then
			discrepancy(modname,
				"%s, the canonical name for <code>%s</code>, is repeated in the table of <code>%s</code>.",
				canonical_name, code, data_key)
		end
		if seen[name] then
			discrepancy(modname,
				"The name %s is found twice or more in the list of <code>%s</code> for %s (<code>%s</code>).",
				name, data_key, canonical_name, code)
		end
		seen[name] = true
	end

	for _, entry in ipairs(list) do
		if type(entry) ~= "table" then
			inspect(entry)
		elseif allow_nested then
			for _, nested_name in ipairs(entry) do
				inspect(nested_name)
			end
		else
			discrepancy(modname,
				"A nested table is found in the list of <code>%s</code> for %s (<code>%s</code>), but isn't allowed.",
				data_key, canonical_name, code)
		end
	end
end
-- Run the alternative-name checks on the otherNames, aliases and varieties
-- lists of a data table, in that order. Only varieties may contain nested
-- tables.
local function check_other_names_aliases_varieties(modname, code, canonical_name, data)
	for _, key in ipairs { "otherNames", "aliases", "varieties" } do
		if data[key] then
			check_other_names_or_aliases(modname, code, canonical_name, data, key, key == "varieties")
		end
	end
end
local get_codepoint = mw.ustring.codepoint

-- Validate the body of a character class used for character detection.
--
-- pattern       the text that will be placed between "[" and "]"
-- modname       data module the discrepancy is filed under
-- code          code of the language or script being checked
-- data          its data table; the name is taken from `canonicalName`,
--               falling back to index 1
-- standardChars true when checking a "standard characters" pattern; only
--               affects the wording of the messages
--
-- Reports a non-string pattern (and stops there, since nothing else can be
-- checked), ranges whose first codepoint is not below the second, and
-- patterns that mw.ustring rejects outright.
local function validate_pattern(pattern, modname, code, data, standardChars)
	local canonical_name = data.canonicalName or data[1]

	if type(pattern) ~= "string" then
		-- tostring() is required: in Lua 5.1, %s raises an error for values
		-- that are neither strings nor numbers.
		discrepancy(modname, '"%s", the %spattern for %s (<code>%s</code>), is not a string.',
			tostring(pattern), standardChars and 'standard character ' or '',
			tostring(canonical_name), code)
		return
	end

	local ranges
	for lower, higher in mw.ustring.gmatch(pattern, "(.)%-(.)") do
		if get_codepoint(lower) >= get_codepoint(higher) then
			ranges = ranges or Array()
			table.insert(ranges, { lower, higher })
		end
	end

	if ranges and ranges[1] then
		local plural = #ranges ~= 1 and "s" or ""
		discrepancy(modname, '%s (<code>%s</code>) specifies an invalid pattern ' ..
			'for %scharacter detection: <code>"%s"</code>. The first codepoint%s ' ..
			'in the range%s %s %s must be less than the second.',
			link(canonical_name), code, standardChars and 'standard ' or '', pattern, plural, plural,
			ranges
				:map(
					function(range)
						return range[1] .. "-" .. range[2] .. (" (U+%X, U+%X)")
							:format(get_codepoint(range[1]), get_codepoint(range[2]))
					end)
				:concat(", "),
			#ranges ~= 1 and "are" or "is")
	end

	-- Let the pattern engine itself have the final say on validity.
	if not pcall(mw.ustring.find, "", "[" .. pattern .. "]") then
		discrepancy(modname, '%s (<code>%s</code>) specifies an invalid pattern for ' ..
			'%scharacter detection: <code>"%s"</code>',
			link(canonical_name), code, standardChars and 'standard ' or '', pattern)
	end
end
-- Validate a replacement specification (data[replacements_name]) such as
-- `sort_key`, `entry_name` or `display`.
--
-- A string value is accepted only for `sort_key` and `entry_name`. Any other
-- non-table value is reported instead of being indexed. For a table, `from`
-- and `to` must be both present or both absent, must be gapless arrays, and
-- `to` may not be longer than `from`; `remove_diacritics`, when present,
-- must be a string.
local function check_entry_name_or_sortkey(modname, code, data, replacements_name)
	local replacements = data[replacements_name]
	-- Language data keeps its name at index 1; other data uses canonicalName.
	local canonical_name = data.canonicalName or data[1] or "???"

	local replacements_type = type(replacements)
	if replacements_type ~= "table" then
		local string_allowed = replacements_name == "sort_key" or replacements_name == "entry_name"
		if not (replacements_type == "string" and string_allowed) then
			discrepancy(modname, "The %s field in the data table for %s (<code>%s</code>) must be a table.",
				replacements_name, canonical_name, code)
		end
		return
	end

	if (replacements.from ~= nil) ~= (replacements.to ~= nil) then
		discrepancy(modname,
			"The <code>from</code> and <code>to</code> arrays in the <code>%s</code> table for %s (<code>%s</code>) are not both defined or both undefined.",
			replacements_name, canonical_name, code)
	elseif replacements.from then
		for _, key in ipairs { "from", "to" } do
			check_array(modname, code, data, key, replacements_name)
		end
	end

	if replacements.remove_diacritics and type(replacements.remove_diacritics) ~= "string" then
		discrepancy(modname,
			"The <code>remove_diacritics</code> field in the <code>%s</code> table for %s (<code>%s</code>) must be a string.",
			replacements_name, canonical_name, code)
	end

	-- The condition flags a `to` array that is longer than `from`, so the
	-- message names `to` as the array that has to be the shorter one.
	if replacements.from and replacements.to
		and m_table.length(replacements.to) > m_table.length(replacements.from) then
		discrepancy(modname,
			"The <code>to</code> array in the <code>%s</code> table for %s (<code>%s</code>) must be shorter or the same length as the <code>from</code> array.",
			replacements_name, canonical_name, code)
	end
end
-- Return true if any entry of the regular language data lists `parent_code`
-- among its ancestors, and false otherwise.
local function has_regular_language_child(parent_code)
	local no_ancestors = {}
	for _, language in pairs(m_language_data) do
		for _, ancestor in pairs(language.ancestors or no_ancestors) do
			if ancestor == parent_code then
				return true
			end
		end
	end
	return false
end
-- Validate the `ancestors` list of a language or etymology language: it must
-- be a gapless array and every entry must be a known language or etymology
-- language code. An etymology language with ancestors is additionally
-- reported when no regular language lists it as an ancestor.
local function check_ancestors(modname, code, data, ancestors, is_etymology_language)
	check_array(modname, code, data, "ancestors")

	local canonical_name = data[1] or data.canonicalName

	if is_etymology_language and not has_regular_language_child(code) then
		discrepancy(modname,
			"The etymology language %s (<code>%s</code>) has an <code>ancestors</code> field, "
			.. "but no regular languages list it as an ancestor.",
			link(canonical_name), code)
	end

	for _, ancestor_code in ipairs(ancestors) do
		local known = m_language_data[ancestor_code] or m_etym_language_data[ancestor_code]
		if not known then
			discrepancy(modname,
				"%s (<code>%s</code>) lists an invalid language code <code>%s</code> as ancestor.",
				link(canonical_name), code, ancestor_code)
		end
	end
end
-- Render a family code for use in a message. A string is wrapped in
-- <code> tags; any other value (for instance when something else was put in
-- the position of the code) is dumped through Module:debug so that building
-- the message does not raise a module error.
local function show_family_code(code)
	if type(code) ~= "string" then
		return require("Module:debug").highlight_dump(code)
	end
	return "<code>" .. code .. "</code>"
end
-- Check every regular-language data module (two-letter, three-letter and
-- exceptional codes) and then cross-check the two lookup modules that map
-- codes to canonical names and back.
--
-- Side effects: fills all_codes, language_names, nonempty_scripts and
-- nonempty_families, which the later checks rely on.
local function check_languages()
	local check_language_data_keys = check_data_keys{
		1, 2, 3, 4, -- canonical name, wikidata item, family, scripts
		"entry_name", "sort_key", "display",
		"otherNames", "aliases", "varieties",
		"type", "scripts", "ancestors",
		"wikimedia_codes", "wikipedia_article", "standardChars",
		"translit_module", "override_translit", "link_tr",
		"dotted_dotless_i"
	}

	-- Validate one language entry. `exdata` is the entry with the same code
	-- in the matching extradata module, or nil if there is none.
	local function check_language(modname, code, data, exdata)
		local canonical_name, lang_type = data[1], data.type

		check_language_data_keys(modname, code, data)

		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code])
		else
			if not m_language_codes[code] then
				-- The name may be missing (reported separately below), so
				-- make sure the message can still be formatted.
				discrepancy("languages/code to canonical name", "The code <code>%s</code> (%s) is missing.", code, tostring(canonical_name))
			end
			all_codes[code] = modname
		end

		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[canonical_name] then
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.",
				link(canonical_name), code, language_names[canonical_name])
		else
			if not m_language_canonical_names[canonical_name] then
				discrepancy("languages/canonical names", "The canonical name %s (<code>%s</code>) is missing.", canonical_name, code)
			end
			language_names[canonical_name] = code
		end

		check_wikidata_item(modname, code, data, 2)

		if exdata then
			check_other_names_aliases_varieties(modname, code, canonical_name, exdata)
		end

		if lang_type and not (lang_type == "regular" or lang_type == "reconstructed" or lang_type == "appendix-constructed") then
			discrepancy(modname, "%s (<code>%s</code>) is of an invalid type <code>%s</code>.", link(canonical_name), code, data.type)
		end

		if data.scripts and data[4] then
			discrepancy(modname, "%s (<code>%s</code>) has both <code>4</code> and <code>scripts</code>.", link(canonical_name), code)
		end

		local sc = data.scripts or data[4]
		if sc then
			check_array(modname, code, data, data.scripts and "scripts" or 4)
			if not sc[1] then
				discrepancy(modname, "%s (<code>%s</code>) has no scripts listed.", link(canonical_name), code)
			else
				for _, sccode in ipairs(sc) do
					if not m_script_data[sccode] then
						discrepancy(modname,
							"%s (<code>%s</code>) lists an invalid script code <code>%s</code>.",
							link(canonical_name), code, sccode)
					end
					nonempty_scripts[sccode] = true
				end
			end
		end

		if data.ancestors then
			check_ancestors(modname, code, data, data.ancestors, false)
		end

		if data[3] then
			local family = data[3]
			if not m_family_data[family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has an invalid family code %s.",
					link(canonical_name), code, show_family_code(family))
			end
			nonempty_families[family] = true
		end

		if data.sort_key then
			check_entry_name_or_sortkey(modname, code, data, "sort_key")
		end
		if data.entry_name then
			check_entry_name_or_sortkey(modname, code, data, "entry_name")
		end
		if data.display then
			check_entry_name_or_sortkey(modname, code, data, "display")
		end

		if data.standardChars then
			validate_pattern(data.standardChars, modname, code, data, true)
		end

		if data.override_translit and not data.translit_module then
			discrepancy(modname,
				"%s (<code>%s</code>) has <code>override_translit</code> set, but no transliteration module",
				link(canonical_name), code)
		end

		if not (data.link_tr == nil or data.link_tr == true) then
			discrepancy(modname,
				"%s (<code>%s</code>) has an <code>link_tr</code> value that is not <code>nil</code> or <code>true</code>: %s",
				link(canonical_name), code,
				tostring(data.link_tr)
			)
		end
	end

	-- Language data stores the canonical name at index 1 (there is no
	-- `canonicalName` key), so the code-shape messages below read data[1].

	-- Check two-letter codes
	local modname = "languages/data2"
	local data2 = require("Module:" .. modname)
	local extradata2 = require("Module:" .. modname:gsub("data", "extradata"))
	for code, data in pairs(data2) do
		if not code:find("^[a-z][a-z]$") then
			discrepancy(modname, '%s (<code>%s</code>) does not have a two-letter code.', link(data[1]), code)
		end
		check_language(modname, code, data, extradata2[code])
	end

	-- Check three-letter codes
	for i = string.byte('a'), string.byte('z') do
		local letter = string.char(i)
		local modname = "languages/data3/" .. letter
		local data3 = require("Module:" .. modname)
		local extradata3 = require("Module:" .. modname:gsub("data", "extradata"))
		local code_pattern = "^" .. letter .. "[a-z][a-z]$"
		for code, data in pairs(data3) do
			if not code:find(code_pattern) then
				discrepancy(modname,
					'%s (<code>%s</code>) does not have a three-letter code starting with "<code>%s</code>".',
					link(data[1]), code, letter)
			end
			check_language(modname, code, data, extradata3[code])
		end
	end

	-- Check exceptional codes
	modname = "languages/datax"
	local datax = require("Module:" .. modname)
	local extradatax = require("Module:" .. modname:gsub("data", "extradata"))
	for code, data in pairs(datax) do
		if code:find("^[a-z][a-z][a-z]?$") then
			discrepancy(modname, '%s (<code>%s</code>) has a two- or three-letter code.', link(data[1]), code)
		end
		check_language(modname, code, data, extradatax[code])
	end

	-- These checks must be done while all_codes only contains language codes:
	-- that is, after language data modules have been processed, but before
	-- etymology languages, families, and scripts have.
	local function check_code_and_name(modname, code, canonical_name)
		if not all_codes[code] then
			if not language_names[canonical_name] then
				discrepancy(modname,
					"The code <code>%s</code> and the canonical name %s should be removed; they are not found in a submodule of [[Module:languages]].",
					code, canonical_name)
			else
				discrepancy(modname,
					"<code>%s</code>, the code for the canonical name %s, is wrong; it should be <code>%s</code>.",
					code, canonical_name, language_names[canonical_name])
			end
		elseif not language_names[canonical_name] then
			local data_table = require("Module:" .. all_codes[code])[code]
			discrepancy(modname,
				"%s, the canonical name for the code <code>%s</code>, is wrong; it should be %s.",
				canonical_name, code, data_table[1] or data_table.canonicalName)
		end
	end

	for code, canonical_name in pairs(m_language_codes) do
		check_code_and_name("languages/code to canonical name", code, canonical_name)
	end

	for canonical_name, code in pairs(m_language_canonical_names) do
		check_code_and_name("languages/canonical names", code, canonical_name)
	end
end
-- Check the etymology-language data: allowed keys, unique codes, canonical
-- names, alternative names, parent codes, ancestors and Wikidata ids, and
-- finally that following `parent` links never loops back on itself.
local function check_etym_languages()
	local modname = "etymology languages/data"

	local verify_keys = check_data_keys{
		"canonicalName", "otherNames", "aliases", "varieties", "parent",
		"wikipedia_article", "wikidata_item", "ancestors", "ancestral_to_parent"
	}

	-- Plain-text label for an etymology language (no wikilink is produced).
	local function link(name)
		if not name then
			return "???"
		end
		if name:find("[Ll]anguage$") then
			return name
		end
		return name .. " language"
	end

	for code, data in pairs(m_etym_language_data) do
		local canonical_name = data.canonicalName
		local parent = data.parent
		local ancestors = data.ancestors

		verify_keys(modname, code, data)

		local defined_in = all_codes[code]
		if defined_in then
			discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, defined_in)
		else
			all_codes[code] = modname
		end

		-- A canonical name that is already taken is not reported here; the
		-- name is simply left registered to the code that claimed it first.
		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif not language_names[canonical_name] then
			language_names[canonical_name] = code
		end

		check_other_names_aliases_varieties(modname, code, canonical_name, data)

		if not parent then
			discrepancy(modname,
				"Etymology-only %s (<code>%s</code>) has no parent language or family code.",
				link(canonical_name), code)
		else
			if type(parent) ~= "string" then
				-- `parent` is known to be non-nil in this branch.
				discrepancy(modname,
					"Etymology-only %s (<code>%s</code>) has a parent language or family code that is %s rather than a string.",
					link(canonical_name), code, "a " .. type(parent))
			elseif not m_language_data[parent] and not m_family_data[parent] and not m_etym_language_data[parent] then
				discrepancy(modname,
					"Etymology-only %s (<code>%s</code>) has invalid parent language or family code <code>%s</code>.",
					link(canonical_name), code, parent)
			end
			nonempty_families[parent] = true
		end

		if ancestors then
			check_ancestors(modname, code, data, ancestors, true)
		end

		check_wikidata_item(modname, code, data, "wikidata_item")
	end

	-- Walk up the parent chain from every entry. Entries already verified by
	-- an earlier walk are not visited again.
	local verified = {}
	for start_code, start_data in pairs(m_etym_language_data) do
		local visiting = {}
		local current_code, current = start_code, start_data

		while current and not verified[current] do
			local parent_code = current.parent
			if visiting[current] then
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(current[1] or current.canonicalName), current_code,
					link(m_etym_language_data[parent_code].canonicalName), parent_code
				)
				break
			end
			visiting[current] = true
			current_code = parent_code
			current = parent_code and m_etym_language_data[parent_code]
		end

		for visited in pairs(visiting) do
			verified[visited] = true
		end
	end
end
-- Check the family data: allowed keys, unique codes and canonical names,
-- alternative names, parent family codes and Wikidata ids; then report
-- families nothing belongs to, and finally detect cycles in the
-- parent-family chain.
--
-- Must run after the language and etymology-language checks, because those
-- fill nonempty_families.
local function check_families()
	local modname = "families/data"

	local check_family_data_keys = check_data_keys{
		"canonicalName", "otherNames", "aliases", "varieties", "family",
		"protoLanguage", "wikidata_item"
	}

	-- Wikilink to a family's category, labelled "<name> family".
	local function link(name)
		if not name then
			return "???"
		elseif name:find("[Ll]anguages$") then
			return "[[:Category:" .. name .. "|" .. name .. " family]]"
		else
			return "[[:Category:" .. name .. " languages|" .. name .. " family]]"
		end
	end

	for code, data in pairs(m_family_data) do
		check_family_data_keys(modname, code, data)

		local canonical_name, family = data.canonicalName, data.family

		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code])
		else
			all_codes[code] = modname
		end

		if not canonical_name then
			discrepancy(modname, "<code>%s</code> has no canonical name specified.", code)
		elseif family_names[canonical_name] then
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.",
				link(canonical_name), code, family_names[canonical_name])
		else
			family_names[canonical_name] = code
		end

		check_other_names_aliases_varieties(modname, code, canonical_name, data)

		if family then
			-- "qfa-not" is the one family allowed to be its own parent.
			if family == code and code ~= "qfa-not" then
				discrepancy(modname,
					"%s (<code>%s</code>) has itself as its family.",
					link(canonical_name), code)
			elseif not m_family_data[family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has an invalid parent family code %s.",
					link(canonical_name), code, show_family_code(family))
			end
			nonempty_families[family] = true
		end

		check_wikidata_item(modname, code, data, "wikidata_item")
	end

	for code, data in pairs(m_family_data) do
		if not (nonempty_families[code] or allowed_empty_families[code]) then
			discrepancy(modname, "%s (<code>%s</code>) has no child families or languages.", link(data.canonicalName), code)
		end
	end

	-- Cycle detection. Family tables keep their parent under the `family`
	-- key, so that is the link followed upwards. "qfa-not" is marked as
	-- checked up front because it points at itself on purpose.
	local checked = { ['qfa-not'] = true }
	for code, data in pairs(m_family_data) do
		local stack = {}

		while data do
			if checked[code] then
				break
			end
			if stack[code] then
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(data.canonicalName or data[1]), code,
					link(m_family_data[data.family].canonicalName), data.family
				)
				break
			end
			stack[code] = true
			local parent_code = data.family
			-- A missing or unknown parent ends the walk.
			code, data = parent_code, parent_code and m_family_data[parent_code]
		end

		for visited_code in pairs(stack) do
			checked[visited_code] = true
		end
	end
end
-- Check the script data: allowed keys, presence of four-character codes in
-- the two script lookup modules, canonical names, alternative names, use by
-- at least one language, and validity of the character pattern.
--
-- Must run after the language checks, which fill nonempty_scripts.
local function check_scripts()
	local modname = "scripts/data"

	local verify_keys = check_data_keys({
		"canonicalName", "otherNames", "aliases", "varieties", "parent",
		"systems", "wikipedia_article", "characters", "direction",
		"character_category",
	}, true)

	local m_script_codes = require('Module:scripts/code to canonical name')
	local m_script_canonical_names = require('Module:scripts/by name')

	for code, data in pairs(m_script_data) do
		local canonical_name = data.canonicalName

		-- Only codes of length 4 are looked up in the lookup modules.
		if not m_script_codes[code] then
			if #code == 4 then
				discrepancy('scripts/code to canonical name', '<code>%s</code> (%s) is missing', code, canonical_name)
			end
		end

		verify_keys(modname, code, data)

		-- A canonical name that is already taken is not reported here; the
		-- name stays registered to the code that claimed it first.
		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif not script_names[canonical_name] then
			if not m_script_canonical_names[canonical_name] then
				if #code == 4 then
					discrepancy('scripts/by name', '%s (<code>%s</code>) is missing', canonical_name, code)
				end
			end
			script_names[canonical_name] = code
		end

		check_other_names_aliases_varieties(modname, code, canonical_name, data)

		-- A script that is in use but lists no characters is not reported.
		if not nonempty_scripts[code] then
			local suffix
			if data.characters then
				suffix = ""
			else
				suffix = " and has no characters listed for auto-detection"
			end
			discrepancy(modname,
				"%s (<code>%s</code>) is not used by any language%s.",
				link_script(canonical_name), code, suffix)
		end

		if data.characters then
			validate_pattern(data.characters, modname, code, data, false)
		end
	end
end
-- Run every consistency check and return a table mapping a data module name
-- to the sorted Array of messages filed under it.
--
-- Warning: cannot be called twice in the same module invocation because
-- some module-global variables are not reset between calls.
function export.do_checks()
	-- While the checks run, reading an unknown module name creates its
	-- message list on demand.
	local auto_create = {}
	function auto_create.__index(self, k)
		local list = Array()
		self[k] = list
		return list
	end
	messages = setmetatable({}, auto_create)

	check_languages()
	check_etym_languages()

	-- Families and scripts must be checked AFTER languages: the language
	-- checks fill nonempty_families and nonempty_scripts, which are used to
	-- tell whether a family or script is ever used in the data.
	check_families()
	check_scripts()

	setmetatable(messages, nil)

	-- First <code>...</code> payload of a message, memoized.
	local find_code = require("Module:fun").memoize(function (message)
		return string.match(message, "<code>([^<]+)</code>")
	end)

	-- Order by embedded code when both messages have one, else by full text.
	local function comp(message1, message2)
		local code1, code2 = find_code(message1), find_code(message2)
		if not (code1 and code2) then
			return message1 < message2
		end
		return code1 < code2
	end

	for _, msglist in pairs(messages) do
		msglist:sort(comp)
	end

	local collected = messages
	messages = nil
	return collected
end
-- Format the messages of one module as wikitext: a level-3 heading linking
-- to the module, followed by one bullet per message.
function export.format_message(modname, msglist)
	local heading = '===[[Module:' .. modname .. ']]==='
	local bullets = msglist:map(function (msg)
		return "\n* " .. msg
	end)
	return heading .. bullets:concat()
end
-- Run all checks and return the formatted messages of just the named
-- modules, in the order given, joined by newlines. Modules without messages
-- are skipped.
function export.check_modules(...)
	local sections = Array()
	local all_messages = export.do_checks()
	for _, module_name in ipairs({...}) do
		local msglist = all_messages[module_name]
		if msglist then
			sections:insert(export.format_message(module_name, msglist))
		end
	end
	return sections:concat("\n")
end
-- Template entry point: passes a shallow copy of the frame's positional
-- arguments to export.check_modules as module names.
function export.check_modules_t(frame)
	local module_names = m_table.shallowcopy(frame.args)
	return export.check_modules(unpack(module_names))
end
-- Entry point: run all checks and return the full report as wikitext.
-- Returns a success banner when no module has any message, otherwise a
-- warning banner followed by one formatted section per module, in sorted
-- module-name order.
function export.perform(frame)
	local messages = export.do_checks()

	-- Format the messages
	local ret = Array()
	for modname, msglist in m_table.sortedPairs(messages) do
		ret:insert(export.format_message(modname, msglist))
	end

	-- Are there any messages? Nothing was formatted exactly when no module
	-- had a discrepancy filed under it.
	if #ret == 0 then
		return '<b class="success">Glory to Arstotzka.</b>'
	end

	ret:insert(1, '<b class="warning">Discrepancies detected:</b>')
	return ret:concat('\n')
end
return export