-- Module:Zh-usex

local m_zh = require("Module:zh") local find = mw.ustring.find local gsub = mw.ustring.gsub local match = mw.ustring.match local sub = mw.ustring.sub local split = mw.text.split

-- Use this when the actual title needs to be known.
-- getCurrentTitle must be *called*: the code below reads `.nsText` and `.text`
-- from the result, which only works on the returned title object.
local actual_title = mw.title.getCurrentTitle()

-- Use this when testcases need to be able to override the title (for bolding,
-- for instance).
local title = actual_title
-- Keeps an already-defined global PAGENAME if there is one; otherwise falls
-- back to the text of the current title.
local PAGENAME = PAGENAME or title.text

-- Table of functions exported by this module (returned at the end of the file).
local export = {}

-- Lookup tables read from the data submodule:
--   variety_list                 indexed by variety name; entries [1], [2] (variety code) and [3] are read below
--   punctuation                  indexed by a word / the page name; the value is used as its transcription
--   ref_list                     indexed by reference key; entries [1] (variety) and [2] (display text) are read below
--   pron_correction              indexed by variety code; passed as the replacement argument of gsub
--   polysyllable_pron_correction indexed by variety code; iterated as pattern -> replacement pairs
local data = mw.loadData("Module:zh-usex/data") local variety_list = data.variety_list local punctuation = data.punctuation local ref_list = data.ref_list local pron_correction = data.pron_correction local polysyllable_pron_correction = data.polysyllable_pron_correction

-- Strings concatenated before/after the simplified and traditional example text.
-- NOTE(review): these are empty/blank in this copy; presumably they were HTML
-- wrapper tags that got stripped -- confirm against the canonical module.
local zh_format_start_simp = "" local zh_format_start_trad = "" local zh_format_end = " "

-- Han_pattern: character class used to test whether a character is a Han character.
-- UTF8_char:   byte-level pattern for one UTF-8 sequence (lead byte plus continuation bytes).
-- tag:         balanced "<...>" match, used for HTML tags adjacent to hyphens.
local Han_pattern = '[一-鿌㐀-䶵𠀀-𬺯]' local UTF8_char = '[%z\1-\127\194-\244][\128-\191]*' local tag = "%b<>"

--- Turn one segment of the example into a wikilink to its Chinese section.
-- NOTE(review): in the reviewed copy the markup inside the string literals was
-- blank (leaving empty "()" position captures whose numeric values would be
-- spliced into the output). The "[[", "]]", "<b>", "</b>", "<br>" and
-- "#Chinese|" literals below were reconstructed from this function's own
-- comments and patterns -- confirm against the canonical module.
-- @param word string: the segment to link (may contain <b>/<br> tags or a "|").
-- @return string: the segment wrapped in wikilink syntax.
local function make_link(word)
	local orig_word = word
	word = "[[" .. word .. "]]"
	word = word
		-- If an entire word is bolded in a link, move the tags out of the wikilink syntax.
		:gsub("(%[%[)(<b>)(.-)(</b>)(%]%])", "%2%1%3%5%4")
		-- Move br tags out of links.
		:gsub("(%[%[)(<br>)", "%2%1")
		-- Link to Chinese section.
		-- Remove bolding from link target, leave it in link text.
		-- Links that already contain "|" are left alone by the [^|] class.
		:gsub("%[%[([^|]-)%]%]",
			function (target)
				-- Parentheses keep only the first result of gsub (the string).
				return "[[" .. (target:gsub("</?b>", "")) .. "#Chinese|" .. target .. "]]"
			end)
	-- When run from the Module namespace, log the conversion for debugging.
	if actual_title.nsText == "Module" then
		mw.log(orig_word, "->", word)
	end
	return word
end

--- Render a Chinese usage example from the arguments of the calling template.
--
-- Reads the parent frame's arguments (example, translation, variety, plus the
-- named options declared in `params`), derives traditional/simplified text and
-- an automatic transcription word by word, and returns the formatted markup.
-- Three layouts are produced: a ruby layout (display_type == "ruby"), a block
-- layout (<dl class="zhusex">) and a one-line layout joined by a divider.
--
-- NOTE(review): many string literals in this function look damaged in the
-- reviewed copy (empty or blank strings where markup such as <b>, <sup>,
-- <span>, <ruby> or category links would be expected, and ". " inside
-- punctuation classes where a single CJK full stop would be expected). They
-- are kept as they appeared and marked below; restore them from the canonical
-- module before deploying.
--
-- @param frame the Scribunto frame object of the #invoke call.
-- @return string: wikitext/HTML for the usage example.
function export.show(frame)
	local params = {
		[1] = { required = true },	-- example
		[2] = {},					-- translation
		[3] = {},					-- variety
		lit = {},
		tr = {},

		ref = {},
		r = { alias_of = "ref" },

		display_type = {},
		type = { alias_of = "display_type" },

		inline = {},

		audio = {},
		a = { alias_of = "audio" },

		collapsed = { type = "boolean" },

		link = { type = "boolean", default = true },
		l = { alias_of = "link" },

		-- Allow specifying pagename in testcases on documentation page.
		pagename = actual_title.nsText == "Module" and {} or nil,

		tr_nocap = { type = "boolean" },
	}

	-- getParent is a method and has to be called to obtain the template's frame.
	local args, unrecognized_args = require("Module:parameters").process(frame:getParent().args, params, true)

	if args.pagename then
		-- Override title in Module namespace.
		title = mw.title.new(args.pagename)
		PAGENAME = title.text
	end

	local example = args[1] or error("Example unspecified.")
	local translation = args[2]
	local literal = args["lit"]
	local reference = args["ref"]
	local manual_tr = args["tr"]
	local display = args["display_type"]
	local inline = args["inline"]
	local audio_file = args["audio"]
	local collapsed = args["collapsed"]
	local phonetic = ""
	-- Number of Han characters in the raw example; drives the layout choice below.
	local original_length = mw.ustring.len(gsub(example, "[^一-龯㐀-䶵]", ""))
	local variety = args[3] or (ref_list[reference] and ref_list[reference][1] or false) or "MSC"
	local variety_data = variety_list[variety] or error("variety " .. variety .. " not recognized.")
	local variety_code
	if variety_data then
		variety_code = variety_data[2]
	end

	local link = args["link"]
	-- link = match(link, "n") == nil
	-- and not (not match(example, " ") and match(example, "[，. ？！﹑]"))

	if next(unrecognized_args) then
		-- Special:WhatLinksHere/Template:tracking/zh-usex/unrecognized arg
		require("Module:debug").track_unrecognized_args(unrecognized_args, "zh-usex")
	end

	if not translation or translation == '' then
		-- per standard Module:usex
		-- NOTE(review): surrounding markup may have been stripped from this literal.
		translation = ' (please add an English translation of this example) '
	end

	-- True when the example contains no ''' bold markers of its own.
	local boldify = not match(example, "'''")

	-- automatically boldify pagetitle if nothing is in bold
	if boldify and not punctuation[PAGENAME] then
		-- NOTE(review): the wrappers around PAGENAME and the second pattern are
		-- empty in this copy, so neither call changes the text as written.
		example = gsub(example, PAGENAME, "" .. PAGENAME .. "")
		example = gsub(example, "", "")
	end

	-- tidying up the example, making it ready for transcription
	example = gsub(example, "([？！，. 、“”…；：‘’|（）「」『』—《》〈〉·　．～])", " %1 ")
	example = gsub(example, " — — ", " —— ") -- double em-dash (to be converted to single em-dash later)
	example = gsub(example, "^ *", "")
	example = gsub(example, " *$", "")
	example = gsub(example, " +", " ")
	-- NOTE(review): as written this only removes the ''' markers.
	example = gsub(example, "%'%'%'([^%']+)%'%'%'", "%1")
	example = gsub(example, "(.)%[([^%[%]]+)%]", function(first, second)
		return ""..first.."" ~= second and first.."["..second.."]" or first.."["..first.."]"
	end)
	-- NOTE(review): as written this replacement is a no-op.
	example = gsub(example, "({[^{}]+})", "%1")

	-- NOTE(review): ruby wrapper strings look stripped (only ")</rp>" survives).
	local ruby_start, ruby_mid, ruby_end = " ", "  ( ", " )</rp>  "
	local ruby_words = {}
	local trad_words, simp_words, tr_words = {}, {}, {}

	-- Whether a separate simplified form is shown alongside the traditional one.
	local simp_exist = (m_zh.ts_determ(gsub(example, "(.)%[%1%]", "")) == "trad" or (match(example, "%^%[%+%]") and not match(example, "(.)%[%1%]"))) and variety_code ~= "vi"

	-- Replace "-" with "🈹" between Han characters (optionally with a HTML
	-- tag between the Han character and the hyphen). This would be simpler with
	-- regex or LPeg.
	-- Defined once here rather than once per word; it only depends on
	-- variety_code and module-level patterns.
	local function replace_hyphen(word)
		-- pos1/pos2 are the byte positions at which `before`/`after` start.
		local function helper(pos1, before, pos2, after)
			if (before == ">"
					and find(word:sub(1, pos1), Han_pattern .. tag .. "$")
					or find(before, Han_pattern))
				and (after == "<"
					and find(word:sub(pos2), "^" .. tag .. Han_pattern)
					or after == "@"
					or find(after, Han_pattern)) then
				return before .. "🈹" .. after
			end
		end
		if variety_code == "cmn" then
			word = word:gsub("%-%-(%-?)", "%1")
		end
		for _ = 1, 2 do
			-- Position captures "()" feed pos1 and pos2; without them the helper
			-- would receive only two arguments.
			word = word:gsub("()(" .. UTF8_char .. ")%-()(" .. UTF8_char .. ")", helper) -- odd and even positions
		end
		return word
	end

	-- Set once the first non-punctuation word has been transcribed; it is not
	-- reset afterwards (unchanged from the previous behaviour within one call).
	local real_word

	for word in mw.text.gsplit(example, " ", true) do
		if gsub(gsub(word, "%{[^%}]+%}", ""), "%.", "") == PAGENAME and boldify then
			word = "'''" .. word .. "'''"
		end
		local trad_word, simp_word, tr_word, ruby_word = word, false, false, ""

		-- various tricks for linking and display in trad. and simp.
		trad_word = gsub(trad_word, "(.)%[(.)%]", "%1")
		trad_word = gsub(trad_word, "{[^{}]*}", "")
		trad_word = gsub(trad_word, "[%^%.]", "")
		trad_word = gsub(trad_word, "\\", "|")
		trad_word = gsub(trad_word, "．", ".")

		if simp_exist then
			simp_word = match(word, "%[") and gsub(gsub(word .. "終[终]", "([^%[%]]*).%[(.)%]", function(a, b) return m_zh.ts(a) .. b end), "终$", "") or m_zh.ts(word)
			simp_word = gsub(simp_word, "{[^{}]*}", "")
			simp_word = gsub(simp_word, "[%^%.]", "")
			simp_word = gsub(simp_word, "\\", "|")
			simp_word = gsub(simp_word, "．", ".")
		end

		-- produce links
		local trad_segments, simp_segments

		trad_segments = split(replace_hyphen(trad_word), '🈹')
		if simp_exist then
			simp_segments = split(replace_hyphen(simp_word), '🈹')
			if #trad_segments ~= #simp_segments then
				error('trad-to-simp conversion changed the number of hyphens')
			end
		end
		for i, trad_segment in ipairs(trad_segments) do
			local contain_pagename = (gsub(gsub(gsub(trad_segment, "</?b>", ""), "%^", ""), "-", "") == PAGENAME) and not punctuation[PAGENAME]
			if match(trad_segment, "|") or (link and not match(trad_segment, "@") and not punctuation[word] and not contain_pagename) then
				trad_segments[i] = make_link(trad_segment)
				if simp_exist then
					simp_segments[i] = make_link(simp_segments[i])
				end
			end
		end
		trad_word = table.concat(trad_segments)
		simp_word = simp_exist and table.concat(simp_segments)
		trad_word = gsub(trad_word, "@", "")
		simp_word = simp_exist and gsub(simp_word, "@", "")

		-- same tricks applied to transcription
		if not manual_tr and (variety_code == "cmn" or variety_code == "yue" or variety_code == "nan" or variety == "H") then
			if punctuation[word] then
				tr_word = punctuation[word]
			else
				real_word = true
				local hyphen = variety_code == "nan" or variety_code == "hak"
				tr_word = gsub(word, "@", "")
				tr_word = gsub(tr_word, "%.", " ")
				tr_word = gsub(tr_word, ".+\\", "")
				tr_word = gsub(tr_word, "%^%[%+%]", "")
				tr_word = gsub(tr_word, ".</b>(%{[^%}]+%})", "%1</b>")
				tr_word = gsub(tr_word, "(.){([^{}]*)}", function(a, b)
					if hyphen and not mw.ustring.find(a, "[a-zA-Z]") then
						return "-" .. b .. "-"
					else
						return b
					end
				end)
				for key, val in pairs(polysyllable_pron_correction[variety_code]) do
					tr_word = gsub(tr_word, key, val)
				end
				tr_word = gsub(tr_word, ".", pron_correction[variety_code])
				if variety_code == "cmn" then
					tr_word = gsub(tr_word, "%-+", function(s) return mw.ustring.len(s)==1 and '' or '-' end)
					tr_word = gsub(tr_word, "[^%-]+", m_zh.py)
				elseif variety_code == "yue" then
					local m_yue_pron = mw.loadData("Module:zh/data/yue-pron")
					tr_word = gsub(tr_word, ".", m_yue_pron.jyutping)
					tr_word = gsub(tr_word, "([a-z])([1-9])(-?)([1-9]?)", "%1%2%3%4 ")
				elseif hyphen then
					tr_word = gsub(tr_word, "[一-鿌㐀-䶵　-〿𠀀-𬺯]+", function(text)
						if m_zh.check_pron(text, variety_code, 1) then
							return gsub(m_zh.check_pron(text, variety_code, 1), "/.+$", "")
						else
							-- No reading for the whole run: look up each character
							-- separately and join the readings with hyphens.
							text = gsub(text, ".", function(ch)
								if m_zh.check_pron(ch, variety_code, 1) then
									return gsub(m_zh.check_pron(ch, variety_code, 1), "/.+$", "") .. "-"
								else
									return ch
								end
							end)
							return gsub(text, "-$", "")
						end
					end)
					tr_word = gsub(tr_word, "%-+", "-")
					tr_word = gsub(tr_word, "%-([^ⁿa-záíúéóḿńàìùèòǹâîûêôāīūēōṳA-ZÁÍÚÉÓḾŃÀÌÙÈÒǸÂÎÛÊÔĀĪŪĒŌṲ])", "%1")
					tr_word = gsub(tr_word, "([^ⁿa-záíúéóḿńàìùèòǹâîûêôāīūēōoóòôōṳA-ZÁÍÚÉÓḾŃÀÌÙÈÒǸÂÎÛÊÔĀĪŪĒŌOÓÒÔŌṲ̄̀́̂̍͘])%-", "%1")
					-- NOTE(review): the pattern is empty in this copy, so as written a
					-- hyphen is inserted at every position. The next line handles
					-- "</b>", which suggests this one originally handled the opening
					-- tag -- confirm and restore.
					tr_word = gsub(tr_word, "", "-")
					tr_word = gsub(tr_word, "</b>", "</b>-")
					tr_word = gsub(tr_word, "%^%-", "^")
					tr_word = gsub(tr_word, "^%-+", "")
					tr_word = gsub(tr_word, "%-+$", "")
					tr_word = gsub(tr_word, "%%%-?", "--")
				end
				if match(tr_word, "[一-鿌㐀-䶵𠀀-𬺯]") then
					require("Module:debug").track("zh-usex/character without transliteration")
				end
			end
		end
		if variety_code == "nan" then
			trad_word = gsub(trad_word, "%%", "")
			simp_word = simp_exist and gsub(simp_word, "%%", "")
		end

		if display == "ruby" then
			ruby_word = ruby_start .. trad_word .. (simp_exist and " " .. simp_word or "") .. ruby_mid .. (real_word and tr_word or "") .. ruby_end
			table.insert(ruby_words, ruby_word)
		else
			table.insert(trad_words, trad_word)
			table.insert(simp_words, simp_word or nil)
			table.insert(tr_words, tr_word or nil)
		end
	end

	local tag_start = " <span style=\"color:darkgreen; font-size:x-small;\">&#91;" -- HTML entity since "MSC" is interpreted poorly
	local tag_end = "&#93; "

	-- Assigned in the non-ruby branch and consumed by the layout code below.
	local trad_text, simp_text

	if display == "ruby" then
		local ruby_tag = " <rb> " .. tag_start .. variety_data[1] .. (simp_exist
						and ", trad.↑ + simp.↓"
						or ", trad. and simp.") .. tag_end ..

			tag_start .. "rom.: " .. variety_data[3] .. tag_end .. " </rb> "

		return table.concat(ruby_words, "") .. ruby_tag .. "<dl><dd>''" .. translation .. "''</dd></dl>"
	else
		-- Keep a space between two adjacent links that both contain Latin letters.
		trad_text = gsub(table.concat(trad_words), "([a-zA-Z]%]%])(%[%[[a-zA-Z])", "%1 %2")
		simp_text = simp_exist and gsub(table.concat(simp_words), "([a-zA-Z]%]%])(%[%[[a-zA-Z])", "%1 %2") or false
		phonetic = manual_tr or (#tr_words > 0 and table.concat(tr_words, " ") or false)

		-- overall transcription formatting
		if phonetic then
			phonetic = gsub(phonetic, " </b>", "</b> ")
			-- NOTE(review): as written this replaces a space with a space.
			phonetic = gsub(phonetic, " ", " ")
			if variety_code == "yue" or variety_code == "zhx-tai" or variety_code == "zhx-teo" or variety_code == "nan-hai" or variety_code == "cmn-sze" then
				-- NOTE(review): as written the replacement equals the match.
				phonetic = gsub(phonetic, "([a-zê]+)([1-9%-]+)", "%1%2") -- superscript tones
			end
			phonetic = gsub(phonetic, " ([,%.?!;:’”)])", "%1") -- remove excess spaces from punctiation
			phonetic = gsub(phonetic, "([‘“(]) ", "%1")
			if not manual_tr then
				phonetic = gsub(phonetic, "%'([^%'])", "%1") -- allow bolding for manual translit
				if variety_code == "nan" then
					phonetic = gsub(phonetic, " +%-%-", "--")
				end
			end

			-- capitalisation
			if variety_code == "yue" then
				args.tr_nocap = '1'
			end
			if not args.tr_nocap and match(example, "[. ？！]") then
				phonetic = "^" .. gsub(phonetic, "([%.?!]) ", "%1 ^")
			end
			phonetic = gsub(phonetic, "([%.%?%!][”’]) (.)", "%1 ^%2")
			phonetic = gsub(phonetic, " (.)", " ^%1")
			phonetic = gsub(phonetic, ": ([“‘])(.)", ": %1^%2")
			-- NOTE(review): as written this replaces "^" with "^".
			phonetic = gsub(phonetic, "%^", "^")
			-- Uppercase the character following each run of "^", then drop the markers.
			phonetic = gsub(phonetic, "%^+.", mw.ustring.upper)
			phonetic = gsub(phonetic, "%^", "")

			-- NOTE(review): the closing tags of the <span> wrappers below appear stripped.
			if variety_code == "wuu" then
				local wuu_pron = require("Module:wuu-pron")
				phonetic = "<span class=\"IPA\">[" .. wuu_pron.ipa_conv(phonetic) .. "] "

			elseif variety_code == "cmn-wuh" then
				phonetic = "<span class=\"IPA\">[" .. phonetic .. "] "

			elseif variety_code == "cdo" then
				local cdo_pron = require("Module:cdo-pron")
				phonetic = "''" .. phonetic .. "''" ..
					(not match(phonetic, "-[^ ]+-[^ ]+-[^ ]+-")
						and " / <span class=\"IPA\"> [" .. cdo_pron.sentence(phonetic) .. "] "
						or "")

			else
				phonetic = "''" .. phonetic .. "''"
			end
			phonetic = "<span style=\"color:#404D52\">" .. phonetic .. " "
		end
	end

	-- NOTE(review): the collapse wrappers are blank in this copy except for
	-- collapse_border_div; presumably markup was stripped.
	local collapse_start, collapse_end, collapse_tag, collapse_border_div, collapse_border_div_end = '', '', '', '', ''
	local simplified_start = ' '
	if collapsed then
		collapse_start = ' '
		collapse_end = ' '
		collapse_tag = ' '
		collapse_border_div = '<div class="vsSwitcher" data-toggle-category="usage examples" style="border-left: 1px solid #930; border-left-width: 2px; padding-left: 0.8em;">'
		collapse_border_div_end = ' '
		simplified_start = ' '
	end

	-- Pieces that are only set under some conditions. They are locals so that a
	-- value assigned during an earlier call can never be picked up by a later one.
	local cat, tr_tag, trad_tag, simp_tag, ts_tag, audio, divider

	if actual_title.nsText == '' then -- fixme: probably categorize only if text contains the actual word
		-- NOTE(review): both category strings are empty in this copy.
		if reference then
			cat = ""
		else
			cat = ""
		end
	end

	-- indentation, font and identity tags
	if ((variety_code == "cmn" and original_length > 7)
			or (variety_code ~= "cmn" and original_length > 5)
			or reference
			or (match(example, "[，. ？！、：；　]") and variety_code == "wuu")
			or (variety_code == "cdo" and original_length > 3)
			or (inline or "" ~= "")) then

		trad_text = zh_format_start_trad .. trad_text .. zh_format_end

		if not phonetic then
			translation = "''" .. translation .. "''"
		end

		if phonetic then
			phonetic = "<dd>" .. collapse_start .. phonetic
			translation = "<dd>" .. translation .. "</dd>"
			tr_tag = tag_start .. variety_data[3] .. tag_end .. collapse_end .. "</dd>"
		else
			translation = "<dd>" .. translation .. "</dd>"
		end

		if audio_file then
			-- NOTE(review): the audio markup inside <dd> is missing in this copy.
			audio = "<dd></dd>"
		end

		trad_tag = collapse_start .. tag_start .. variety_data[1] .. ", <i>trad." .. ((simp_exist or variety_code == "vi") and "" or " and simp.") .. "</i>" .. tag_end .. collapse_end .. collapse_tag

		if simp_exist then
			simp_text = simplified_start .. collapse_start .. zh_format_start_simp .. simp_text .. zh_format_end
			simp_tag = tag_start .. variety_data[1] .. ", simp." .. tag_end .. collapse_end
		end

		if reference then
			reference = "<dd>" .. collapse_start .. " From: " .. (ref_list[reference] and ref_list[reference][2] or reference) .. " " .. collapse_end .. "</dd>"
		end

		return collapse_border_div .. "<dl class=\"zhusex\">" .. trad_text .. trad_tag .. (simp_text or "") .. (simp_tag or "") .. (reference or "") .. (phonetic and phonetic .. tr_tag or "") .. (audio or "") .. translation .. "</dl>" .. (cat or "") .. collapse_border_div_end

	else
		trad_text = zh_format_start_trad .. trad_text .. zh_format_end
		divider = " ―  "

		if variety ~= "MSC" then
			ts_tag = tag_start .. variety_data[1] .. tag_end
			tr_tag = tag_start .. variety_data[3] .. tag_end
		end

		if not phonetic then
			translation = "''" .. translation .. "''"
		end

		if simp_exist then
			simp_text = " / " .. zh_format_start_simp .. simp_text .. zh_format_end
		end

		if audio_file then
			-- NOTE(review): the audio markup is blank in this copy.
			audio = " "
		end

		return trad_text .. (simp_text or "") .. (ts_tag or "") .. divider .. (phonetic and phonetic .. (tr_tag or "") .. (audio or "") .. divider or "") .. translation .. (literal and " (literally, “" .. literal .. "”)" or "") .. (cat or "")
	end
end

--- Convert an existing wikitext example into the input format of this module.
--
-- May be called with a plain string, or with a frame-like table; in the latter
-- case the arguments are read from `text.args` (falling back to the parent
-- frame when the table has no first argument). With two or more arguments the
-- order is ref, text, translation; with one it is just the text.
--
-- @param text string|table: the wikitext to convert, or a frame.
-- @param translation string|nil: English translation, if any.
-- @param ref string|nil: reference key, if any.
-- @return string: the cleaned text when no translation is given; otherwise the
--   template-call string (see NOTE below).
function export.migrate(text, translation, ref)
	if type(text) == "table" then
		if not text.args or not text.args[1] then
			-- getParent is a method and has to be called.
			text = text:getParent()
		end
		if text.args[2] and text.args[2] ~= '' then
			ref = text.args[1]
			translation = text.args[3]
			text = text.args[2]
		else
			text = text.args[1]
		end
	end

	text = text
		:gsub('^[%*#: \n]+', '')		-- leading list/indent markers
		:gsub('[ \n]+$', '')			-- trailing whitespace
		:gsub(' +', '　')				-- spaces -> ideographic space
		:gsub('\n+', ' ')
		:gsub('|', '\\')				-- piped links use "\" in the module's syntax
		:gsub('\'\'\'%[%[', ' ')		-- bold marker directly before a link
		:gsub('%]%]\'\'\'', ' ')		-- bold marker directly after a link
		:gsub('%]%]%[%[', ' ')			-- adjacent links become separate words
		:gsub('%]%]', '')
		:gsub('%[%[', '')
		:gsub('\'\'\'', '')				-- any remaining bold markers
		:gsub(',', '，')
		:gsub('!', '！')
		:gsub('%?', '？')

	if translation then
		-- NOTE(review): both return strings are empty in the reviewed copy;
		-- presumably they built a template call from text, translation and ref
		-- and the markup was stripped. Restore from the canonical module.
		if ref and ref ~= '' then
			return ''
		else
			return ''
		end
	else
		return text
	end
end

-- Module result: the table holding the public functions (show, migrate).
return export