-- Permanently protected module
-- From Wikipedia, the free encyclopedia


-- Shorthand: turns one or more Unicode code points into a string.
local U = mw.ustring.char

-- Combining marks, all taken from the [[Combining Diacritical Marks]] block
-- (U+0300..U+036F). They are spliced into the character-class patterns of
-- the per-language "replacements" rules further down.
local grave        = U(0x0300) -- COMBINING GRAVE ACCENT
local acute        = U(0x0301) -- COMBINING ACUTE ACCENT
local circumflex   = U(0x0302) -- COMBINING CIRCUMFLEX ACCENT
local tilde        = U(0x0303) -- COMBINING TILDE
local macron       = U(0x0304) -- COMBINING MACRON
local breve        = U(0x0306) -- COMBINING BREVE
local dot          = U(0x0307) -- COMBINING DOT ABOVE
local diaeresis    = U(0x0308) -- COMBINING DIAERESIS
local double_acute = U(0x030B) -- COMBINING DOUBLE ACUTE ACCENT
local caron        = U(0x030C) -- COMBINING CARON
local double_grave = U(0x030F) -- COMBINING DOUBLE GRAVE ACCENT
local invbreve     = U(0x0311) -- COMBINING INVERTED BREVE
local dot_below    = U(0x0323) -- COMBINING DOT BELOW
local undertie     = U(0x035C) -- COMBINING DOUBLE BREVE BELOW



--[[

	

	This is a table of Wiktionary language codes with data belonging to them.

	Name is the "canonical name" used on Wiktionary.

	Article is the Wikipedia article.

	Script is the ISO 15924 code.

]]

--- Build the replacement rules shared by every Arabic variety below.
-- A fresh table is returned on each call so that every language entry keeps
-- its own independent "replacements" table, exactly as if the rules had been
-- written out in full for each entry.
local function arabic_replacements()
	return {
		-- ālif with wasla is replaced by ālif;
		[U(0x0671)] = U(0x0627),
		-- taṭwīl, fatḥatan, ḍammatan, kasratan,
		-- fatḥa, ḍamma, kasra,
		-- shadda, sukūn, and superscript (dagger) ālif are removed.
		["[" .. U(0x0640) .. U(0x064B) .. U(0x064C) .. U(0x064D)
			.. U(0x064E) .. U(0x064F) .. U(0x0650)
			.. U(0x0651) .. U(0x0652) .. U(0x0670) .. "]"] = "",
	}
end

-- The exported table has two sub-tables:
--   "languages": language code -> per-language settings ("name", "article",
--                "type", "direction", "replacements", "Wikipedia_code", ...).
--   "redirects": Wikipedia language code -> equivalent Wiktionary code.
--
-- "replacements" comes in two shapes:
--   * a map of pattern -> replacement string, or
--   * { decompose = true, from = {...}, to = {...} }, where `from` and `to`
--     are parallel lists. NOTE(review): several `from` lists are longer than
--     their `to` list (or have no `to` at all); presumably the consumer
--     treats a missing `to` item as "remove" -- confirm against the caller.
local data = {
	["languages"] = {
		["aaq"] = {
			["name"] = "Penobscot",
		},
		["ab"] = {
			["name"] = "Abkhaz",
		},
		["abe"] = {
			["name"] = "Abenaki",
		},
		["ang"] = {
			["name"] = "Old English",
			-- NOTE(review): every other "article" is a plain string; this one
			-- is a table. Left untouched -- confirm what the consumer expects.
			["article"] = {"Old English"},
			-- Remove macrons, acutes, and overdots
			["replacements"] = {
				decompose = true,
				from = { "[" .. macron .. acute .. dot .. "]" },
			},
		},
		["ar"] = {
			["name"] = "Arabic",
			["article"] = "Arabic language",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["ara"] = {
			["name"] = "Arabic",
			["article"] = "Arabic language",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["arb"] = {
			["name"] = "Modern Standard Arabic",
			["article"] = "Modern Standard Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["apc"] = {
			["name"] = "North Levantine Arabic",
			["article"] = "North Levantine Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["ajp"] = {
			["name"] = "South Levantine Arabic",
			["article"] = "South Levantine Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["arz"] = {
			["name"] = "Egyptian Arabic",
			["article"] = "Egyptian Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["av"] = {
			["name"] = "Avar",
		},
		["be"] = {
			["article"] = "Belarusian language",
			-- Strip the combining acute accent. The key must be the character
			-- itself, hence the brackets: a bare `acute = ""` would create the
			-- literal string key "acute".
			["replacements"] = { [acute] = "", },
		},
		["bn"] = {
			["name"] = "Bengali",
			["article"] = "Bengali language",
		},
		["bua"] = {
			["name"] = "Buryat",
		},
		["cel-pro"] = {							-- Incorrect tag
			["name"] = "Proto-Celtic",
			["Wikipedia_code"] = "cel-x-proto",
		},
		["cel-x-proto"] = {
			["name"] = "Proto-Celtic",
		},
		["cel-bry-pro"] = {						-- Incorrect tag
			["name"] = "Proto-Brythonic",
			["article"] = "Common Brittonic",
			["type"] = "reconstructed",
		},
		["com"] = {
			["name"] = "Comanche",
			["article"] = "Comanche language",
		},
		["cu"] = {
			["name"] = "Old Church Slavonic",
			["article"] = "Old Church Slavonic",
		},
		["de"] = {
			["name"] = "German",
			["article"] = "German language",
		},
		["en"] = {
			["name"] = "English",
			["article"] = "English language",
		},
		["es"] = {
			["name"] = "Spanish",
			["article"] = "Spanish language",
		},
		["egy"] = {
			["name"] = "Egyptian",
		},
		["evn"] = {
			["name"] = "Evenki",
			["article"] = "Evenki language",
		},
		["fr"] = {
			["name"] = "French",
			["article"] = "French language",
		},
		["frm"] = {
			["name"] = "Middle French",
			["article"] = "Middle French",
		},
		["frp"] = {
			["name"] = "Franco-Provençal",
		},
		["ff"] = {
			["name"] = "Fula",
		},
		["gem-pro"] = {							-- Incorrect tag
			["name"] = "Proto-Germanic",
			["article"] = "Proto-Germanic language",
			["type"] = "reconstructed",
			["replacements"] = {},
			["Wikipedia_code"] = "gem-x-proto",
		},
		["gem-x-proto"] = {
			["name"] = "Proto-Germanic",
			["article"] = "Proto-Germanic language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["gml"] = {
			["name"] = "Middle Low German",
		},
		["gmw-ecg"] = {
			["name"] = "East Central German",
		},
		["gmw-x-proto"] = {
			["name"] = "Proto-West Germanic",
			["article"] = "Proto-West Germanic language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["gmq-x-gut"] = {
			["name"] = "Gutnish",
			["article"] = "Gutnish",
		},
		["goh"] = {
			["replacements"] = {
				decompose = true,
				from = {
					"[" .. macron .. circumflex .. diaeresis .. "]",
				},
			},
		},
		["got"] = {
			["name"] = "Gothic",
			["article"] = "Gothic language",
			["replacements"] = {
				-- Latin to Gothic since people will not want to have to copy
				-- and paste Gothic letters in
				["[AÁaáĀā]"] = "𐌰",
				["[Bb]"]     = "𐌱",
				["[Gg]"]     = "𐌲",
				["[Dd]"]     = "𐌳",
				["[EeĒē]"]   = "𐌴",
				["[Qq]"]     = "𐌵",
				["[Zz]"]     = "𐌶",
				["[Hh]"]     = "𐌷",
				["[Þþ]"]     = "𐌸",
				["[IiÍí]"]   = "𐌹",
				["[Kk]"]     = "𐌺",
				["[Ll]"]     = "𐌻",
				["[Mm]"]     = "𐌼",
				["[Nn]"]     = "𐌽",
				["[Jj]"]     = "𐌾",
				["[UuÚúŪū]"] = "𐌿",
				["[Pp]"]     = "𐍀",
				["[Rr]"]     = "𐍂",
				["[Ss]"]     = "𐍃",
				["[Tt]"]     = "𐍄",
				["[WwYy]"]   = "𐍅",
				["[Ff]"]     = "𐍆",
				["[Xx]"]     = "𐍇",
				["[Ƕƕ]"]    = "𐍈", -- Not sure if "hw" and "hv" can safely be converted
				["[OoŌō]"]   = "𐍉",
			},
		},
		["gsw"] = {
			["name"] = "Alemannic German",
		},
		["grc"] = {
			["name"] = "Ancient Greek",
			["article"] = "Ancient Greek",
			["replacements"] = {
				decompose = true,
				from = {
					-- Replace variant letterforms with standard ones.
					"ϐ", "ϵ", "ϑ", "ϰ", "ϱ", "ϲ", "ϕ",
					-- Remove macrons, breves, and underties.
					"[" .. macron .. breve .. undertie .. "]"
				},
				to   = {
					"β", "ε", "θ", "κ", "ρ", "σ", "φ",
				}
			},
		},
		["grk-pro"] = {							-- Incorrect tag
			["name"] = "Proto-Hellenic",
			["Wikipedia_name"] = "Proto-Greek",
			["article"] = "Proto-Greek language",
			["type"] = "reconstructed",
			["replacements"] = {},
			["Wikipedia_code"] = "grk-x-proto",
		},
		["grk-x-proto"] = {
			["name"] = "Proto-Hellenic",
			["Wikipedia_name"] = "Proto-Greek",
			["article"] = "Proto-Greek language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["grt"] = {
			["name"] = "Garo",
		},
		["ha"] = {
			["name"] = "Hausa",
			-- remove tilde, grave, acute, macron, circumflex
			["replacements"] = {
				decompose = true,
				from = { "[" .. grave .. circumflex .. macron .. acute .. tilde .. "]" },
			},
		},
		["hi"] = {
			["name"] = "Hindi",
			["article"] = "Hindi",
		},
		["ine-bsl-pro"] = {
			["name"] = "Proto-Balto-Slavic",
			["article"] = "Proto-Balto-Slavic language",
			["type"] = "reconstructed",
		},
		["ine-pro"] = {							-- Incorrect tag
			["name"] = "Proto-Indo-European",
			["article"] = "Proto-Indo-European language",
			["type"] = "reconstructed",
			["replacements"] = {},
			["Wikipedia_code"] = "ine-x-proto",
		},
		["ine-x-proto"] = {
			["name"] = "Proto-Indo-European",
			["article"] = "Proto-Indo-European language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["ja"] = {
			["name"] = "Japanese",
			["article"] = "Japanese language",
		},
		["jbo"] = { -- Lojban
			["type"] = "appendix",
		},
		["ket"] = {
			["name"] = "Ket",
			["article"] = "Ket language",
		},
		["ksk"] = {
			["name"] = "Kansa",
			["article"] = "Kansa language",
		},
		["la"] = {
			["name"] = "Latin",
			["article"] = "Latin",
			["replacements"] = {
				decompose = true,
				from = { "[" .. macron .. breve .. diaeresis .. "]" },
			},
		},
		["lt"] = {
			["name"] = "Lithuanian",
			-- remove acute, tilde, grave
			["replacements"] = {
				decompose = true,
				from = { "[" .. acute .. tilde .. grave .. "]" },
			},
		},
		["moe"] = {
			["name"] = "Cree",
		},
		["mul"] = {
			["name"] = "Translingual",
			["article"] = "",
		},
		["nci"] = {
			["name"] = "Classical Nahuatl",
			["article"] = "Classical Nahuatl",
			["replacements"] = {
				decompose = true,
				-- Remove macrons, acutes, circumflexes, graves, and saltillo;
				-- see [[Saltillo (linguistics)]].
				from = { "[" .. grave .. acute .. macron .. circumflex .. "Ꞌꞌʻʼ'ʔ]" },
			},
		},
		["nds-de"] = {
			["name"] = "German Low German",
		},
		["non"] = {
			["name"] = "Old Norse",
		},
		["non-x-proto"] = {
			["name"] = "Proto-Norse",
		},
		["odt"] = {
			["name"] = "Old Dutch",
		},
		["oge"] = {
			["name"] = "Old Georgian",
		},
		["oj"] = {
			["name"] = "Ojibwe",
		},
		["orv"] = {
			["name"] = "Old East Slavic",
			["article"] = "Old East Slavic",
			["replacements"] = {
				[U(0x484)] = "",
			},
		},
		["osx"] = {
			["name"] = "Old Saxon",
		},
		["pt"] = {
			["name"] = "Portuguese",
			["article"] = "Portuguese language",
			-- ["scripts"] = { "Latn" },
		},
		["pa"] = {
			["name"] = "Punjabi",
			["article"] = "Punjabi language",
		},
		["pgl"] = {
			["name"] = "Primitive Irish",
			["article"] = "Primitive Irish",
		},
		["pis"] = {
			["name"] = "Pijin",
			["article"] = "Pijin language",
		},
		["poz-x-poly-proto"] = {
			["name"] = "Proto-Nuclear Polynesian",
			["article"] = "Proto-Polynesian language",
			["type"] = "reconstructed",
		},
		["rap"] = {
			["name"] = "Rapa Nui",
			["article"] = "Rapa Nui language",
		},
		["ru"] = {
			["name"] = "Russian",
			["article"] = "Russian language",
			-- Strip the combining acute accent (bracketed so the key is the
			-- character, not the string "acute").
			["replacements"] = { [acute] = "", },
		},
		["rw"] = {
			["name"] = "Rwanda-Rundi",
		},
		["se"] = {
			["replacements"] = {
				["([đflmnŋrsšŧv])'%1"] = "%1%1",
			},
		},
		["sem-pro"] = {
			["name"] = "Proto-Semitic",
			["article"] = "Proto-Semitic",
			["type"] = "reconstructed",
		},
		["sh"] = {
			["article"] = "Serbo-Croatian language",
			["replacements"] = {
				decompose = true,
				from =  { "([AaEeIiOoUuRrАаЕеИиОоУуРр])[" .. double_grave
					.. grave .. invbreve .. acute .. macron .. tilde .. "]" },
				to   = { "%1" },
			},
		},
		["sl"] = {
			["name"] = "Slovene",
			["replacements"] = {
				decompose = true,
				-- remove tonal orthography
				from = {"ł", "[" .. grave .. acute .. macron .. double_grave .. invbreve .. circumflex .. dot_below .. "]"},
				to = {"l"},
			},
		},
		["sla-pro"] = {
			["name"] = "Proto-Slavic", -- also Common Slavic
			["type"] = "reconstructed",
			["replacements"] = {
				["[ÀÁÃĀȀȂ]"] = "A",
				["[àáãāȁȃ]"] = "a",
				["[ÈÉẼĒȄȆ]"] = "E",
				["[èéẽēȅȇ]"] = "e",
				["[ÌÍĨĪȈȊ]"] = "I",
				["[ìíĩīȉȋ]"] = "i",
				["[ÒÓÕŌȌȎŐ]"] = "O",
				["[òóõōȍȏő]"] = "o",
				["[ÙÚŨŪȔȖŰ]"] = "U",
				["[ùúũūȕȗű]"] = "u",
				["[ỲÝỸȲ]"] = "Y",
				["[ỳýỹȳ]"] = "y",
				["Ǭ"] = "Ǫ",
				["ǭ"] = "ǫ",
				["[" .. grave .. acute .. double_acute .. tilde .. macron .. double_grave .. invbreve .. "]"] = "",
				["ĭ"] = "ь",
				["ŭ"] = "ъ",
			},
		},
		["tts"] = {
			["name"] = "Isan", -- also "Northeastern Thai"
			["article"] = "Isan language",
		},
		["ug"] = {
			["name"] = "Uyghur", --also less commonly "Uighur"
			["article"] = "Uyghur language",
		},
		["uk"] = {
			["article"] = "Ukrainian language",
			-- Strip the combining acute accent (bracketed so the key is the
			-- character, not the string "acute").
			["replacements"] = { [acute] = "", },
		},
		["ur"] = {
			["name"] = "Urdu",
			["article"] = "Urdu",
		},
		["xcl"] = {
			["name"] = "Old Armenian",
			["article"] = "Classical Armenian",
			["replacements"] = {
				["[՞՜՛՟]"] = "",
				["և"] = "եւ",
			},
		},
		["xgf"] = {
			["name"] = "Tongva", -- not ISO name "Gabrielino-Fernandeño"
			["article"] = "Tongva language",
			["replacements"] = {
				["['`ʔ]"] = "ʼ",
			},
		},
		["xlu"] = {
			["name"] = "Luwian", -- not ISO name "Cuneiform Luwian"
			["article"] = "Cuneiform Luwian",
		},
		["xpq"] = {
			["name"] = "Mohegan-Pequot",
		},
		["xxt"] = {
			["name"] = "Tambora",
			["article"] = "Tambora language",
		},
		["xvn"] = {
			["name"] = "Vandalic",
			["article"] = "Vandalic language",
		},
		["yua"] = {
			["name"] = "Yucatec Maya",
			["article"] = "Yucatec Maya language",
		},
		["zh"] = {
			["name"] = "Chinese",
			["article"] = "Chinese language",
			-- ["scripts"] = { "Hani" },
		},
	},

	-- Here, keys (for example, "gem") are Wikipedia language codes used in
	-- {{lang}}, and values (for example, "gem-pro") are the equivalent
	-- Wiktionary code.
	-- Subtags are not currently supported.
	["redirects"] = {
		["aae"] = "sq",
		["aiq"] = "fa",
		["aln"] = "sq",
		["als"] = "sq",
		["azb"] = "az",
		["azj"] = "az",
		["bgn"] = "bal",
		["bs"] = "sh",
		["bxr"] = "bua",
		["ciw"] = "oj",
		["cnr"] = "sh",
		["fil"] = "tl",
		["fuf"] = "ff",
		["gem"] = "gem-pro", -- Not correct, but is commonly used.
		["hak"] = "zh",
		["hbo"] = "he",
		["hr"] = "sh",
		["ine"] = "ine-pro", -- Not correct, but might be commonly used.
		["kjv"] = "sh",
		["nan"] = "zh",
		["prs"] = "fa",
		["rn"] = "rw",
		["sli"] = "gmw-ecg",
		["sr"] = "sh",
		["src"] = "sc",
		["sro"] = "sc",
		["tw"] = "ak",
		["wae"] = "gsw",
		["wep"] = "nds-de",
		["yue"] = "zh",
		["xno"] = "fro",
	},
}

return data