-- Module:Data consistency check

local export = {}

local m_language_data = require("Module:languages/alldata") local m_language_codes = require('Module:languages/code to canonical name') local m_language_canonical_names = require('Module:languages/canonical names') local m_etym_language_data = require("Module:etymology languages/data") local m_family_data = require('Module:families/data') local m_script_data = require('Module:scripts/data')

local m_table = require("Module:table") local Array = require("Module:array")

-- Per-module message lists. Assigned by export.do_checks before any check
-- runs and cleared again when it returns.
local messages

--- Record one formatted discrepancy message against a data module.
-- @param modname  key into `messages` identifying the module the message is about
-- @param ...      a string.format pattern followed by its arguments
local function discrepancy(modname, ...)
	local list = messages[modname]
	local text = string.format(...)
	list:insert(text)
end

-- Every code registered so far, mapped to the name of the data module that
-- defined it; the check functions use it to report duplicated codes.
local all_codes = {}

-- Canonical name -> code lookups for languages (including etymology
-- languages), families and scripts, filled in as each data table is walked.
local language_names = {} local family_names = {} local script_names = {}

-- Family codes that some language, etymology language or family refers to.
local nonempty_families = {}
-- Family codes that are exempt from the "no child families or languages" report.
local allowed_empty_families = {tbq = true}
-- Script codes that at least one language lists in its `scripts` array.
local nonempty_scripts = {}

--- Build the display text for a language name.
-- @param name  canonical language name, or nil
-- @return "???" when no name is given; the name itself when it already ends
--         in "language"/"Language"; otherwise the name followed by " language"
local function link(name)
	if not name then
		return "???"
	end
	if name:find("[Ll]anguage$") then
		return name
	end
	return name .. " language"
end

--- Build the display text for a script name.
-- @param name  canonical script name, or nil
-- @return "???" when no name is given; the name itself when it ends in
--         "code" or "semaphore" (either case of the first letter); otherwise
--         the name followed by " script"
local function link_script(name)
	if not name then
		return "???"
	end
	local needs_no_suffix = name:find("[Cc]ode$") or name:find("[Ss]emaphore$")
	if needs_no_suffix then
		return name
	end
	return name .. " script"
end

--- Report the unrecognised keys found in one data table.
-- @param modname       module the data table belongs to
-- @param code          code of the entry being checked
-- @param data          the entry's data table (name read from `canonicalName` or `[1]`)
-- @param invalid_keys  Array of the offending keys
-- @param is_script     true to format the entry name with link_script instead of link
local function invalid_keys_message(modname, code, data, invalid_keys, is_script)
	local plural = #invalid_keys ~= 1
	-- Five arguments, five placeholders: plural suffix, key list, entry name,
	-- entry code, verb.
	discrepancy(modname, "The data key%s %s for %s (<code>%s</code>) %s invalid.",
		plural and "s" or "",
		invalid_keys
			:map(
				function(key)
					-- Keys may be numbers (positional fields), so stringify.
					return "<code>" .. tostring(key) .. "</code>"
				end)
			:concat(", "),
		(is_script and link_script or link)(data.canonicalName or data[1]),
		code,
		plural and "are" or "is")
end

--- Build a checker that reports keys of a data table that are not whitelisted.
-- @param valid_keys  list of permitted keys
-- @param is_script   passed through to invalid_keys_message
-- @return function(modname, code, data) that records one discrepancy listing
--         every key of `data` not present in `valid_keys`
local function check_data_keys(valid_keys, is_script)
	valid_keys = Array(valid_keys):to_set()
	return function (modname, code, data)
		local invalid_keys
		for k in pairs(data) do
			if not valid_keys[k] then
				-- Create a fresh Array per data table; the list is only
				-- allocated when something is actually wrong.
				invalid_keys = invalid_keys or Array()
				invalid_keys:insert(k)
			end
		end
		if invalid_keys then
			invalid_keys_message(modname, code, data, invalid_keys, is_script)
		end
	end
end

-- Modification of isArray in Module:table.
--- Find the first missing positive-integer index in a table meant to be an array.
-- Counts the table's entries with pairs() and checks that index 1..count are
-- all present.
-- @param t  table to inspect
-- @return the first index i (1 <= i <= number of entries) with t[i] == nil,
--         or nil when the entries form a gapless sequence
local function find_gap(t)
	local i = 0
	for _ in pairs(t) do
		i = i + 1
		if t[i] == nil then
			return i
		end
	end
end

--- Report a gap in one array-valued field of a data table.
-- @param modname     module the data table belongs to
-- @param code        code of the entry being checked
-- @param data        the entry's data table
-- @param array_name  key of the array field inside `data`
local function check_array(modname, code, data, array_name)
	local gap = find_gap(data[array_name])
	if gap then
		-- Four arguments, four placeholders; %d must receive the numeric gap
		-- index, not the code. Fall back to "???" because %s rejects nil.
		discrepancy(modname,
			"The %s array in the data table for %s (<code>%s</code>) has a gap at index %d.",
			array_name, data.canonicalName or data[1] or "???", code, gap)
	end
end

--- Check one list of alternative names (otherNames, aliases or varieties).
-- Reports gaps in the list, entries equal to the canonical name, entries that
-- occur more than once, and nested tables where they are not permitted.
-- @param modname         module the data table belongs to
-- @param code            code of the entry being checked
-- @param canonical_name  canonical name of the entry (may be nil)
-- @param data            the entry's data table
-- @param data_key        key of the list inside `data`
-- @param allow_nested    when true, list items may themselves be lists of names
local function check_other_names_or_aliases(modname, code, canonical_name, data, data_key, allow_nested)
	local array = data[data_key]
	if not array then
		return
	end
	check_array(modname, code, data, data_key)

	-- %s rejects nil, and an entry without a canonical name is reported
	-- elsewhere, so use a placeholder here.
	local display_name = canonical_name or "???"

	local names = {}
	local function check_other_name(other_name)
		if other_name == canonical_name then
			discrepancy(modname,
				"%s, the canonical name for <code>%s</code>, is repeated in the table of <code>%s</code>.",
				display_name, code, data_key)
		end
		if names[other_name] then
			discrepancy(modname,
				"The name %s is found twice or more in the list of <code>%s</code> for %s (<code>%s</code>).",
				tostring(other_name), data_key, display_name, code)
		end
		names[other_name] = true
	end

	for _, other_name in ipairs(array) do
		if type(other_name) == "table" then
			if not allow_nested then
				discrepancy(modname,
					"A nested table is found in the list of <code>%s</code> for %s (<code>%s</code>), but isn't allowed.",
					data_key, display_name, code)
			else
				for _, on in ipairs(other_name) do
					check_other_name(on)
				end
			end
		else
			check_other_name(other_name)
		end
	end
end

--- Run the alternative-name checks on every name list an entry defines.
-- `otherNames` and `aliases` must be flat lists; `varieties` may contain
-- nested lists.
-- @param modname         module the data table belongs to
-- @param code            code of the entry being checked
-- @param canonical_name  canonical name of the entry
-- @param data            the entry's data table
local function check_other_names_aliases_varieties(modname, code, canonical_name, data)
	-- { field name, whether nested tables are allowed }
	local name_lists = {
		{ "otherNames" },
		{ "aliases" },
		{ "varieties", true },
	}
	for _, spec in ipairs(name_lists) do
		local field, nested_ok = spec[1], spec[2]
		if data[field] then
			check_other_names_or_aliases(modname, code, canonical_name, data, field, nested_ok)
		end
	end
end

local get_codepoint = mw.ustring.codepoint

--- Validate a character-set pattern (the inside of a "[...]" class).
-- Reports a non-string pattern, ranges "a-b" whose first codepoint is not
-- smaller than the second, and patterns that mw.ustring.find rejects.
-- @param pattern        the pattern to validate
-- @param modname        module the data table belongs to
-- @param code           code of the entry being checked
-- @param data           the entry's data table (name read from `canonicalName` or `[1]`)
-- @param standardChars  true when the pattern is a `standardChars` field;
--                       only affects message wording
local function validate_pattern(pattern, modname, code, data, standardChars)
	local canonical_name = data.canonicalName or data[1]

	if type(pattern) ~= "string" then
		discrepancy(modname, '"%s", the %spattern for %s (<code>%s</code>), is not a string.',
			tostring(pattern), standardChars and 'standard character ' or '',
			link(canonical_name), code)
		-- The remaining checks need a string; gmatch would raise otherwise.
		return
	end

	local ranges
	for lower, higher in mw.ustring.gmatch(pattern, "(.)%-(.)") do
		if get_codepoint(lower) >= get_codepoint(higher) then
			ranges = ranges or Array()
			table.insert(ranges, { lower, higher })
		end
	end

	if ranges and ranges[1] then
		local plural = #ranges ~= 1 and "s" or ""
		discrepancy(modname, '%s (<code>%s</code>) specifies an invalid pattern ' ..
			'for %scharacter detection: <code>%s</code>. The first codepoint%s ' ..
			'in the range%s %s %s must be less than the second.',
			link(canonical_name), code, standardChars and 'standard ' or '', pattern, plural, plural,
			ranges
				:map(
					function(range)
						return range[1] .. "-" .. range[2] ..
							(" (U+%X, U+%X)")
								:format(get_codepoint(range[1]), get_codepoint(range[2]))
					end)
				:concat(", "),
			#ranges ~= 1 and "are" or "is")
	end

	-- Let the pattern engine itself decide whether the class is well-formed.
	if not pcall(mw.ustring.find, "", "[" .. pattern .. "]") then
		discrepancy(modname, '%s (<code>%s</code>) specifies an invalid pattern for ' ..
			'%scharacter detection: <code>%s</code>',
			link(canonical_name), code, standardChars and 'standard ' or '', pattern)
	end
end

--- Validate an `entry_name` or `sort_key` replacement specification.
-- Checks that the value is a table (a string is accepted for `sort_key`
-- only), that `from` and `to` are both present or both absent and gapless,
-- that `remove_diacritics` is a string, and that `to` is not longer than `from`.
-- @param modname            module the data table belongs to
-- @param code               code of the entry being checked
-- @param data               the entry's data table
-- @param replacements_name  "entry_name" or "sort_key"
local function check_entry_name_or_sortkey(modname, code, data, replacements_name)
	local replacements = data[replacements_name]
	-- Language data keeps the name at [1]; %s rejects nil, hence the fallback.
	local name = data.canonicalName or data[1] or "???"

	if type(replacements) ~= "table" then
		if not (type(replacements) == "string" and replacements_name == "sort_key") then
			discrepancy(modname,
				"The %s field in the data table for %s (<code>%s</code>) must be a table.",
				replacements_name, name, code)
		end
		return
	end

	if (replacements.from ~= nil) ~= (replacements.to ~= nil) then
		discrepancy(modname,
			"The <code>from</code> and <code>to</code> arrays in the <code>%s</code> table for %s (<code>%s</code>) are not both defined or both undefined.",
			replacements_name, name, code)
	elseif replacements.from then
		for _, key in ipairs { "from", "to" } do
			local gap = find_gap(replacements[key])
			if gap then
				discrepancy(modname,
					"The %s array in the %s table for %s (<code>%s</code>) has a gap at index %d.",
					key, replacements_name, name, code, gap)
			end
		end
	end

	if replacements.remove_diacritics and type(replacements.remove_diacritics) ~= "string" then
		discrepancy(modname,
			"The <code>remove_diacritics</code> field in the <code>%s</code> table for %s (<code>%s</code>) must be a string.",
			replacements_name, name, code)
	end

	if replacements.from and replacements.to
			and m_table.length(replacements.to) > m_table.length(replacements.from) then
		discrepancy(modname,
			"The <code>to</code> array in the <code>%s</code> table for %s (<code>%s</code>) must be shorter or the same length as the <code>from</code> array.",
			replacements_name, name, code)
	end
end

--- Tell whether any entry of m_language_data lists the given code as an ancestor.
-- @param parent_code  code to look for in the `ancestors` fields
-- @return true if at least one language lists it, false otherwise
local function has_regular_language_child(parent_code)
	local function lists_parent(ancestor_list)
		for _, ancestor_code in pairs(ancestor_list) do
			if ancestor_code == parent_code then
				return true
			end
		end
		return false
	end

	for _, lang_data in pairs(m_language_data) do
		local ancestor_list = lang_data.ancestors
		if ancestor_list and lists_parent(ancestor_list) then
			return true
		end
	end
	return false
end

--- Validate the `ancestors` list of a language or etymology language.
-- Reports gaps in the list, ancestor codes that are neither a language nor an
-- etymology language, and (for etymology languages) the case where no entry
-- of m_language_data lists this code as an ancestor.
-- @param modname                 module the data table belongs to
-- @param code                    code of the entry being checked
-- @param data                    the entry's data table
-- @param ancestors               the entry's `ancestors` list
-- @param is_etymology_language   true when the entry is an etymology language
local function check_ancestors(modname, code, data, ancestors, is_etymology_language)
	check_array(modname, code, data, "ancestors")
	local canonical_name = data[1] or data.canonicalName

	if is_etymology_language and not has_regular_language_child(code) then
		discrepancy(modname,
			"The etymology language %s (<code>%s</code>) has an <code>ancestors</code> field, "
			.. "but no regular languages list it as an ancestor.",
			link(canonical_name), code)
	end

	for _, ancestor_code in ipairs(ancestors) do
		if not (m_language_data[ancestor_code] or m_etym_language_data[ancestor_code]) then
			discrepancy(modname,
				"%s (<code>%s</code>) lists an invalid language code <code>%s</code> as ancestor.",
				link(canonical_name), code, tostring(ancestor_code))
		end
	end
end

--- Check every regular-language data module.
-- Walks languages/data2, languages/data3/a..z and languages/datax, validating
-- each entry, then cross-checks the two lookup modules ("code to canonical
-- name" and "canonical names") against what was found.
local function check_languages()
	local check_language_data_keys = check_data_keys{
		1, 2, 3, -- canonical name, wikidata item, family
		"entry_name", "sort_key", "otherNames", "aliases", "varieties",
		"type", "scripts", "ancestors",
		"wikimedia_codes", "wikipedia_article", "standardChars",
		"translit_module", "override_translit", "link_tr",
	}

	-- Validate a single language entry and register its code and name.
	local function check_language(modname, code, data)
		local canonical_name, wikidata_item, lang_type = data[1], data[2], data.type
		-- %s rejects nil, so raw-name placeholders need a fallback.
		local display_name = canonical_name or "???"

		check_language_data_keys(modname, code, data)

		if all_codes[code] then
			discrepancy(modname,
				"Code <code>%s</code> is not unique; it is also defined in Module:%s.",
				code, all_codes[code])
		else
			if not m_language_codes[code] then
				discrepancy("languages/code to canonical name",
					"The code <code>%s</code> (%s) is missing.", code, display_name)
			end
			all_codes[code] = modname
		end

		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[canonical_name] then
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.",
				link(canonical_name), code, language_names[canonical_name])
		else
			if not m_language_canonical_names[canonical_name] then
				discrepancy("languages/canonical names",
					"The canonical name %s (<code>%s</code>) is missing.", canonical_name, code)
			end
			language_names[canonical_name] = code
		end

		if wikidata_item then
			-- A non-string item has no :match method, so test the type first.
			if type(wikidata_item) ~= "string" or not wikidata_item:match("^Q%d+$") then
				discrepancy(modname,
					"%s (<code>%s</code>) has a Wikidata item with an invalid form: <code>%s</code>.",
					display_name, code, tostring(wikidata_item))
			end
		end

		check_other_names_aliases_varieties(modname, code, canonical_name, data)

		if lang_type and not (lang_type == "regular" or lang_type == "reconstructed"
				or lang_type == "appendix-constructed") then
			discrepancy(modname,
				"%s (<code>%s</code>) is of an invalid type <code>%s</code>.",
				link(canonical_name), code, tostring(lang_type))
		end

		if data.scripts then
			check_array(modname, code, data, "scripts")
			if not data.scripts[1] then
				discrepancy(modname,
					"%s (<code>%s</code>) has no scripts listed.",
					link(canonical_name), code)
			else
				for _, sccode in ipairs(data.scripts) do
					if not m_script_data[sccode] then
						discrepancy(modname,
							"%s (<code>%s</code>) lists an invalid script code <code>%s</code>.",
							link(canonical_name), code, tostring(sccode))
					end
					nonempty_scripts[sccode] = true
				end
			end
		end

		if data.ancestors then
			check_ancestors(modname, code, data, data.ancestors, false)
		end

		if data[3] then
			local family = data[3]
			if not m_family_data[family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has an invalid family code <code>%s</code>.",
					link(canonical_name), code, tostring(family))
			end
			nonempty_families[family] = true
		end

		if data.sort_key then
			check_entry_name_or_sortkey(modname, code, data, "sort_key")
		end
		if data.entry_name then
			check_entry_name_or_sortkey(modname, code, data, "entry_name")
		end

		if data.standardChars then
			validate_pattern(data.standardChars, modname, code, data, true)
		end

		if data.override_translit and not data.translit_module then
			discrepancy(modname,
				"%s (<code>%s</code>) has <code>override_translit</code> set, but no transliteration module",
				link(canonical_name), code)
		end
	end

	-- Check two-letter codes
	local modname = "languages/data2"
	local data2 = require("Module:" .. modname)
	for code, data in pairs(data2) do
		if not code:find("^[a-z][a-z]$") then
			-- Language data stores the canonical name at [1].
			discrepancy(modname, '%s (<code>%s</code>) does not have a two-letter code.',
				link(data[1]), code)
		end
		check_language(modname, code, data)
	end

	-- Check three-letter codes
	for i = string.byte('a'), string.byte('z') do
		local letter = string.char(i)
		local letter_modname = "languages/data3/" .. letter
		local data3 = require("Module:" .. letter_modname)
		local code_pattern = "^" .. letter .. "[a-z][a-z]$"
		for code, data in pairs(data3) do
			if not code:find(code_pattern) then
				discrepancy(letter_modname,
					'%s (<code>%s</code>) does not have a three-letter code starting with "<code>%s</code>".',
					link(data[1]), code, letter)
			end
			check_language(letter_modname, code, data)
		end
	end

	-- Check exceptional codes
	modname = "languages/datax"
	local datax = require("Module:" .. modname)
	for code, data in pairs(datax) do
		if code:find("^[a-z][a-z][a-z]?$") then
			discrepancy(modname, '%s (<code>%s</code>) has a two- or three-letter code.',
				link(data[1]), code)
		end
		check_language(modname, code, data)
	end

	-- These checks must be done while all_codes only contains language codes:
	-- that is, after language data modules have been processed, but before
	-- etymology languages, families, and scripts have.
	local function check_code_and_name(lookup_modname, code, canonical_name)
		if not all_codes[code] then
			if not language_names[canonical_name] then
				discrepancy(lookup_modname,
					"The code <code>%s</code> and the canonical name %s should be removed; they are not found in a submodule of Module:languages.",
					code, canonical_name)
			else
				discrepancy(lookup_modname,
					"<code>%s</code>, the code for the canonical name %s, is wrong; it should be <code>%s</code>.",
					code, canonical_name, language_names[canonical_name])
			end
		elseif not language_names[canonical_name] then
			local data_table = require("Module:" .. all_codes[code])[code]
			discrepancy(lookup_modname,
				"%s, the canonical name for the code <code>%s</code>, is wrong; it should be %s.",
				canonical_name, code, data_table[1] or data_table.canonicalName or "???")
		end
	end

	for code, canonical_name in pairs(m_language_codes) do
		check_code_and_name("languages/code to canonical name", code, canonical_name)
	end

	for canonical_name, code in pairs(m_language_canonical_names) do
		check_code_and_name("languages/canonical names", code, canonical_name)
	end
end

--- Check the etymology-language data module.
-- Validates keys, code uniqueness, names, the `parent` field and `ancestors`
-- of every entry, then looks for cycles in the chain of `parent` links
-- between etymology languages.
local function check_etym_languages()
	local modname = "etymology languages/data"

	local check_etymology_language_data_keys = check_data_keys{
		"canonicalName", "otherNames", "aliases", "varieties", "parent",
		"wikipedia_article", "wikidata_item", "ancestors"
	}

	-- Display text for an etymology language name.
	local function link(name)
		if not name then
			return "???"
		elseif name:find("[Ll]anguage$") then
			return name
		else
			return name .. " language"
		end
	end

	for code, data in pairs(m_etym_language_data) do
		local canonical_name, parent, ancestors =
			data.canonicalName, data.parent, data.ancestors

		check_etymology_language_data_keys(modname, code, data)

		if all_codes[code] then
			discrepancy(modname,
				"Code <code>%s</code> is not unique; it is also defined in Module:%s.",
				code, all_codes[code])
		else
			all_codes[code] = modname
		end

		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[canonical_name] then
			-- Duplicate canonical names are currently tolerated for
			-- etymology languages; the report below is left disabled.
			--[=[
			discrepancy(modname,
				"%s has a canonical name that is not unique; it is also used by the code  .",
				link(data.names[1]), code, language_names[data.names[1]])
			--]=]
		else
			language_names[canonical_name] = code
		end

		check_other_names_aliases_varieties(modname, code, canonical_name, data)

		if parent then
			if type(parent) ~= "string" then
				discrepancy(modname,
					"Etymology-only %s (<code>%s</code>) has a parent language or family code that is %s rather than a string.",
					link(canonical_name), code, "a " .. type(parent))
			elseif not (m_language_data[parent] or m_family_data[parent]
					or m_etym_language_data[parent]) then
				discrepancy(modname,
					"Etymology-only %s (<code>%s</code>) has invalid parent language or family code <code>%s</code>.",
					link(canonical_name), code, parent)
			end
			nonempty_families[parent] = true
		else
			discrepancy(modname,
				"Etymology-only %s (<code>%s</code>) has no parent language or family code.",
				link(canonical_name), code)
		end

		if ancestors then
			check_ancestors(modname, code, data, ancestors, true)
		end
	end

	-- Cycle detection: follow `parent` links through etymology languages,
	-- remembering the data tables seen on the current walk (`stack`) and
	-- those already cleared by earlier walks (`checked`).
	local checked = {}
	for start_code, start_data in pairs(m_etym_language_data) do
		local stack = {}
		local code, data = start_code, start_data

		while data do
			if checked[data] then
				break
			end
			if stack[data] then
				discrepancy(modname,
					"%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(data[1] or data.canonicalName), code,
					link(m_etym_language_data[data.parent].canonicalName), data.parent
				)
				break
			end
			stack[data] = true
			code, data = data.parent, data.parent and m_etym_language_data[data.parent]
		end

		for seen_data in pairs(stack) do
			checked[seen_data] = true
		end
	end
end

--- Check the family data module.
-- Validates keys, code and name uniqueness, alternative names and the
-- `family` (parent) field of every entry, reports families that nothing
-- refers to, and looks for cycles in the chain of `family` links.
-- Must run after the language checks, which populate nonempty_families.
local function check_families()
	local modname = "families/data"

	local check_family_data_keys = check_data_keys{
		"canonicalName", "otherNames", "aliases", "varieties", "family",
		"protoLanguage", "wikidata_item"
	}

	-- Display text for a family name.
	local function link(name)
		if not name then
			return "???"
		end
		return name .. " family"
	end

	for code, data in pairs(m_family_data) do
		check_family_data_keys(modname, code, data)

		if all_codes[code] then
			discrepancy(modname,
				"Code <code>%s</code> is not unique; it is also defined in Module:%s.",
				code, all_codes[code])
		else
			all_codes[code] = modname
		end

		if not data.canonicalName then
			discrepancy(modname, "<code>%s</code> has no canonical name specified.", code)
		elseif family_names[data.canonicalName] then
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.",
				link(data.canonicalName), code, family_names[data.canonicalName])
		else
			family_names[data.canonicalName] = code
		end

		check_other_names_aliases_varieties(modname, code, data.canonicalName, data)

		if data.family then
			-- "qfa-not" is the one family allowed to name itself as parent.
			if data.family == code and code ~= "qfa-not" then
				discrepancy(modname,
					"%s (<code>%s</code>) has itself as its family.",
					link(data.canonicalName), code)
			elseif not m_family_data[data.family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has an invalid parent family code <code>%s</code>.",
					link(data.canonicalName), code, tostring(data.family))
			end
			nonempty_families[data.family] = true
		end
	end

	for code, data in pairs(m_family_data) do
		if not (nonempty_families[code] or allowed_empty_families[code]) then
			discrepancy(modname,
				"%s (<code>%s</code>) has no child families or languages.",
				link(data.canonicalName), code)
		end
	end

	-- Cycle detection over `family` links. "qfa-not" is pre-marked because it
	-- is permitted to point at itself. Family data uses the named keys
	-- `family` and `canonicalName`, so the walk must follow `data.family`.
	local checked = { ['qfa-not'] = true }
	for start_code, start_data in pairs(m_family_data) do
		local stack = {}
		local code, data = start_code, start_data

		while data do
			if checked[code] then
				break
			end
			if stack[code] then
				discrepancy(modname,
					"%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(data.canonicalName), code,
					link(m_family_data[data.family].canonicalName), data.family
				)
				break
			end
			stack[code] = true
			code, data = data.family, m_family_data[data.family]
		end

		for seen_code in pairs(stack) do
			checked[seen_code] = true
		end
	end
end

--- Check the script data module.
-- Validates keys, names and alternative names of every script, cross-checks
-- four-character codes against the two script lookup modules, reports
-- scripts no language uses, and validates each `characters` pattern.
-- Must run after the language checks, which populate nonempty_scripts.
local function check_scripts()
	local modname = "scripts/data"

	local check_script_data_keys = check_data_keys({
		"canonicalName", "otherNames", "aliases", "varieties", "parent",
		"systems", "wikipedia_article", "characters", "direction",
		"character_category",
	}, true)

	local m_script_codes = require('Module:scripts/code to canonical name')
	local m_script_canonical_names = require('Module:scripts/by name')

	for code, data in pairs(m_script_data) do
		local canonical_name = data.canonicalName

		-- Only four-character codes are looked up in the lookup modules.
		if not m_script_codes[code] and #code == 4 then
			discrepancy('scripts/code to canonical name',
				'<code>%s</code> (%s) is missing', code, canonical_name or "???")
		end

		check_script_data_keys(modname, code, data)

		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif script_names[canonical_name] then
			-- Duplicate canonical names are currently tolerated for scripts;
			-- the report below is left disabled.
			--[=[
			discrepancy(modname,
				"%s has a canonical name that is not unique; it is also used by the code  .",
				link_script(data.names[1]), code, script_names[data.names[1]])
			--]=]
		else
			if not m_script_canonical_names[canonical_name] and #code == 4 then
				discrepancy('scripts/by name',
					'%s (<code>%s</code>) is missing', canonical_name, code)
			end
			script_names[canonical_name] = code
		end

		check_other_names_aliases_varieties(modname, code, canonical_name, data)

		if not nonempty_scripts[code] then
			discrepancy(modname,
				"%s (<code>%s</code>) is not used by any language%s.",
				link_script(canonical_name), code, data.characters and ""
					or " and has no characters listed for auto-detection")
		-- Disabled check:
		-- elseif not data.characters then
		-- 	discrepancy(modname, "%s has no characters listed for auto-detection.", link_script(canonical_name), code)
		end

		if data.characters then
			validate_pattern(data.characters, modname, code, data, false)
		end
	end
end

-- Warning: cannot be called twice in the same module invocation because
-- some module-global variables are not reset between calls.
--- Run every consistency check and collect the resulting messages.
-- @return a plain table mapping module name -> Array of message strings,
--         each Array sorted by the first <code>...</code> value in the
--         message and then by message text
function export.do_checks()
	-- Auto-create an empty message list the first time a module is mentioned.
	messages = setmetatable({}, {
		__index = function (self, k)
			local val = Array()
			self[k] = val
			return val
		end
	})

	check_languages()
	check_etym_languages()

	-- families and scripts must be checked AFTER languages; languages checks fill out
	-- the nonempty_families and nonempty_scripts tables, used for testing if a family/script
	-- is ever used in the data
	check_families()
	check_scripts()

	-- From here on, indexing an unknown module must yield nil, not a new list.
	setmetatable(messages, nil)

	local function find_code(message)
		return string.match(message, "<code>([^<]+)</code>")
	end

	find_code = require("Module:fun").memoize(find_code)

	-- Total order: by extracted code (messages without one sort first), ties
	-- broken by message text. Mixing "compare codes" and "compare messages"
	-- depending on the pair is not a consistent ordering, which table.sort
	-- requires.
	local function comp(message1, message2)
		local code1 = find_code(message1) or ""
		local code2 = find_code(message2) or ""
		if code1 ~= code2 then
			return code1 < code2
		end
		return message1 < message2
	end

	for modname, msglist in pairs(messages) do
		msglist:sort(comp)
	end

	local ret = messages
	messages = nil
	return ret
end

--- Format one module's messages as a heading followed by a bulleted list.
-- @param modname  module name used in the "===Module:...===" heading
-- @param msglist  Array of message strings
-- @return the heading immediately followed by "\n* <message>" for each message
function export.format_message(modname, msglist)
	return '===Module:' .. modname .. '==='
		.. msglist
			:map(
				function(msg)
					return "\n* " .. msg
				end)
			:concat()
end

--- Run all checks and return the formatted messages for selected modules.
-- @param ...  module names (keys of the table returned by export.do_checks)
-- @return the formatted sections of the named modules that have messages,
--         joined by newlines; an empty string when none of them has any
function export.check_modules(...)
	local ret = Array()
	local messages = export.do_checks()
	for _, module in ipairs {...} do
		local msglist = messages[module]
		if msglist then
			ret:insert(export.format_message(module, msglist))
		end
	end
	return ret:concat("\n")
end

--- Template entry point for export.check_modules.
-- Copies the frame's arguments into a fresh table and passes them on as the
-- list of module names.
-- NOTE(review): the copy is presumably needed because frame.args is not a
-- plain table that unpack() can read directly; confirm against Scribunto docs.
-- @param frame  frame object whose `args` hold the module names
-- @return whatever export.check_modules returns for those names
function export.check_modules_t(frame)
	local module_names = m_table.shallowcopy(frame.args)
	return export.check_modules(unpack(module_names))
end

--- Main entry point: run all checks and render the full report.
-- @param frame  frame object (unused)
-- @return a success line when no module produced any message, otherwise a
--         warning line followed by one formatted section per module, in
--         sorted module-name order
function export.perform(frame)
	local messages = export.do_checks()

	-- Format the messages
	local ret = Array()
	for modname, msglist in m_table.sortedPairs(messages) do
		ret:insert(export.format_message(modname, msglist))
	end

	-- Are there any messages? (Test the collected sections themselves; the
	-- previous test read an undefined variable and could never succeed.)
	if #ret == 0 then
		return ' Glory to Arstotzka.'
	else
		ret:insert(1, ' Discrepancies detected:')
		return ret:concat('\n')
	end
end

return export