Módulo:Unicode data/tests
La documentación para este módulo puede ser creada en Módulo:Unicode data/tests/doc
-- Test cases for [[Module:Unicode data]].
--
-- Every p:test_* method builds a list of examples and hands it to
-- self:iterate together with a callback that compares the module's actual
-- result with the expected one through self:equals.
-- NOTE(review): iterate/equals come from the UnitTests module, which is not
-- visible here; presumably each row's values are unpacked into the callback's
-- arguments and bare strings inside a list are rendered as section headings.
local p = require "Module:UnitTests/zona de pruebas"

-- When this page is itself a "zona de pruebas" subpage, test the matching
-- subpage of the module instead of the main module.
local Unicode_data = require(
	mw.title.getCurrentTitle().subpageText == "zona de pruebas"
		and "Module:Unicode data/zona de pruebas"
		or "Module:Unicode data")

local U = mw.ustring.char
local get_codepoint = mw.ustring.codepoint

--- Build the label used for a code point in the test output.
-- @param codepoint number: the code point to describe
-- @return string: "U+XXXX" alone when Unicode_data.is_printable is false,
--   otherwise "U+XXXX: " followed by the character itself
local function show(codepoint)
	if Unicode_data.is_printable(codepoint) then
		local printed_codepoint = U(codepoint)
		-- A character that NFC normalization would alter is written as a
		-- numeric character reference instead of literally.
		if mw.ustring.toNFC(printed_codepoint) ~= printed_codepoint then
			printed_codepoint = ("&#x%X;"):format(codepoint)
		end
		-- Give combining characters a dotted circle to attach to.
		if Unicode_data.is_combining(codepoint) then
			printed_codepoint = "◌" .. printed_codepoint
		end
		return ("U+%04X: %s"):format(codepoint, printed_codepoint)
	else
		return ("U+%04X"):format(codepoint)
	end
end

--- Same as show(), with the name from Unicode_data.lookup_name appended in
-- parentheses.
-- @param codepoint number
-- @return string
local function show_codepoint_and_name(codepoint)
	return ("%s (%s)"):format(show(codepoint), Unicode_data.lookup_name(codepoint))
end

--- Checks Unicode_data.lookup_name against { codepoint, expected name } rows.
function p:test_lookup_name()
	local examples = {
		{ 0x0000, "<control-0000>" },
		{ 0x007F, "<control-007F>" },
		{ 0x00C1, "LATIN CAPITAL LETTER A WITH ACUTE" },
		{ 0x0300, "COMBINING GRAVE ACCENT" },
		{ 0x0378, "<reserved-0378>" },
		{ 0x1B44, "BALINESE ADEG ADEG" },
		{ 0x1F71, "GREEK SMALL LETTER ALPHA WITH OXIA" },
		{ 0x3555, "CJK UNIFIED IDEOGRAPH-3555" },
		{ 0xAC01, "HANGUL SYLLABLE GAG" },
		{ 0xD5FF, "HANGUL SYLLABLE HEH" },
		{ 0xDC00, "<surrogate-DC00>", },
		{ 0xEEEE, "<private-use-EEEE>" },
		{ 0xFDD1, "<noncharacter-FDD1>", },
		{ 0xFFFD, "REPLACEMENT CHARACTER" },
		{ 0xFFFF, "<noncharacter-FFFF>" },
		{ 0x1F4A9, "PILE OF POO" },
		{ 0xE0000, "<reserved-E0000>" },
		{ 0xF0F0F, "<private-use-F0F0F>" },
		{ 0x10FFFF, "<noncharacter-10FFFF>" },
	}

	self:iterate(examples, function (self, codepoint, name)
		self:equals(show(codepoint), Unicode_data.lookup_name(codepoint), name)
	end)
end

--- Checks Unicode_data.lookup_age against { codepoint, expected age } rows.
function p:test_lookup_age()
	local examples = {
		{ 0x0061, "1.1" },
		{ 0x0378, "NA" },
		{ 0x1B44, "5.0" },
		{ 0x3555, "3.0" },
		{ 0xAC01, "2.0" },
		{ 0xDC00, "2.0", },
		{ 0xEEEE, "1.1" },
		{ 0xFDD1, "3.1", },
		{ 0x1F4A9, "6.0" },
		{ 0xE0000, "NA" },
		{ 0xF0F0F, "2.0" },
		{ 0x10FFFF, "2.0" },
	}

	self:iterate(examples, function (self, codepoint, age)
		-- Remove pcall when this function is added to [[Module:Unicode data]].
		-- Until then any error raised inside the call (for instance because
		-- lookup_age is nil) is deliberately discarded, so the row is skipped.
		pcall(function ()
			self:equals(show(codepoint), Unicode_data.lookup_age(codepoint), age)
		end)
	end)
end

--- Checks Unicode_data.is_combining against { codepoint, expected boolean } rows.
function p:test_is_combining()
	local examples = {
		{ 0x0300, true },
		{ 0x0060, false },
	}

	self:iterate(examples, function (self, codepoint, expected)
		self:equals(
			show_codepoint_and_name(codepoint),
			Unicode_data.is_combining(codepoint),
			expected)
	end)
end

--- Checks Unicode_data.is_default_ignorable against
-- { codepoint, expected boolean } rows.
function p:test_is_default_ignorable()
	local examples = {
		{ 0x0061, false },
		{ 0x00AD, true },
	}

	self:iterate(examples, function (self, codepoint, expected)
		-- Remove pcall when this function is added to [[Module:Unicode data]].
		-- Until then any error raised inside the call is deliberately discarded.
		pcall(function ()
			self:equals(
				show_codepoint_and_name(codepoint),
				Unicode_data.is_default_ignorable(codepoint),
				expected)
		end)
	end)
end

--- Checks Unicode_data.lookup_script against { codepoint, script code } rows.
function p:test_lookup_script()
	local examples = {
		{ 0x0061, "Latn" },
		{ 0x002F, "Zyyy" },
		{ 0x0300, "Zinh" },
		{ 0x0378, "Zzzz" },
		{ 0x0398, "Grek" },
		{ 0x03E2, "Copt" },
		{ 0x2014, "Zyyy" },
	}

	self:iterate(examples, function (self, codepoint, expected)
		self:equals(
			show_codepoint_and_name(codepoint),
			Unicode_data.lookup_script(codepoint),
			expected)
	end)
end

--- Checks Unicode_data.lookup_category against
-- { codepoint, general category } rows.
function p:test_lookup_category()
	local examples = {
		{ get_codepoint "\t", "Cc" },
		{ get_codepoint " ", "Zs" },
		{ get_codepoint "[", "Ps" },
		{ get_codepoint "]", "Pe" },
		{ get_codepoint "^", "Sk" },
		{ get_codepoint "A", "Lu" },
		{ 0x00AD, "Cf" },
		{ get_codepoint "¾", "No" },
		{ get_codepoint "«", "Pi" },
		{ get_codepoint "»", "Pf" },
		{ 0x0300, "Mn" },
		{ 0x0488, "Me" },
		{ get_codepoint "٣", "Nd" },
		{ get_codepoint "子", "Lo" },
		{ get_codepoint "ᾮ", "Lt" },
		{ 0x1B44, "Mc" },
		{ get_codepoint "∈", "Sm" },
		{ get_codepoint "‿", "Pc" },
		{ get_codepoint "↹", "So" },
		{ get_codepoint "⸗", "Pd" },
		{ get_codepoint "Ⅷ", "Nl" },
		{ 0x2028, "Zl" },
		{ 0x2029, "Zp" },
		{ get_codepoint "ゞ", "Lm" },
		{ 0xD800, "Cs" },
		{ get_codepoint "£", "Sc" },
		{ 0xFFFF, "Cn" },
		{ 0x100000, "Co" },
	}

	self:iterate(examples, function (self, codepoint, expected)
		self:equals(
			show_codepoint_and_name(codepoint),
			Unicode_data.lookup_category(codepoint),
			expected)
	end)
end

local fun = require "Module:Fun"
local m_table = require "Module:TableTools"

-- Metatable for per-script counters: reading a missing key stores and returns
-- 0, so counts can be incremented without initialisation. The table is also
-- its own metatable, which makes `script_to_count_mt()` a constructor that
-- returns a fresh, empty counter.
local script_to_count_mt = {
	__index = function (self, key)
		self[key] = 0
		return 0
	end,
	__call = function (self, ...)
		return setmetatable({}, self)
	end
}
setmetatable(script_to_count_mt, script_to_count_mt)

-- Memoizing lookup from a string to a printable summary of how many of its
-- code points Unicode_data.lookup_script assigns to each script, in the form
-- "Latn (5), Zyyy (2)". Non-string keys yield nil.
local script_counts = setmetatable({}, {
	__index = function (self, str)
		if type(str) ~= "string" then
			return nil
		end

		local script_to_count = script_to_count_mt()
		for codepoint in mw.ustring.gcodepoint(str) do
			local script = Unicode_data.lookup_script(codepoint)
			script_to_count[script] = script_to_count[script] + 1
		end

		-- NOTE(review): presumably sortedPairs orders the keys with the given
		-- comparator (most frequent script first) and mapIter passes each
		-- pair to the callback as (value, key) - confirm against
		-- Module:TableTools and Module:Fun.
		local printed = table.concat(
			fun.mapIter(
				function (count, script)
					return ("%s (%d)"):format(script, count)
				end,
				m_table.sortedPairs(
					script_to_count,
					function (script1, script2)
						return script_to_count[script1] > script_to_count[script2]
					end)),
			", ")

		-- Cache the result so the string is only analysed once.
		self[str] = printed
		return printed
	end,
})

-- Shared examples for test_get_best_script and test_is_Latin.
-- Rows are { text, expected best script or nil }; test_is_Latin also reads an
-- optional third value as the expected is_Latin result.
-- NOTE(review): whitespace inside the long quoted passages may not match the
-- wiki source exactly (line breaks appear to have been flattened); this only
-- affects the displayed label and the count of Zyyy characters.
local script_examples = {
	-- To demonstrate that "is_Latin" doesn't treat a string of Zyyy and Zinh
	-- characters as Latn.
	-- This particular example only has characters below U+0340, so
	-- lookup_script doesn't have to be called.
	{ "%!?́", nil },
	{ "’ʼ“”†‡•‰′‽⁕", nil },
	{ "col·legi", "Latn" },

	"HTML character references",
	-- NOTE(review): each pair below is two identical literals here; given the
	-- heading they were presumably written as two different character
	-- references for the same character - confirm against the wiki source.
	{ "𐘀", "Lina" },
	{ "𐘀", "Lina" },
	{ "–", nil },
	{ "–", nil },

	-- Examples from [[Template talk:Lang#Italicisation of Halkomelem]]
	"Halkomelem",
	{ "lá:yelhp", "Latn" },
	{ "xʷməθkʷəy̓əm", nil }, -- one Greek (Grek) character
	{ "hən̓q̓əmin̓əm̓", "Latn" },

	"Quotes",
	-- [[s:it:Divina Commedia/Inferno/Canto I]]
	{
		[[Tant’è amara che poco è più morte; ma per trattar del ben ch’i’ vi trovai, dirò de l’altre cose ch’i’ v’ ho scorte.]],
		"Latn"
	},
	{
		-- A blessing in Navajo:
		-- [[User talk:Stephen G. Brown/text8]]
		[[Díí Késhmish biyiʼ yáʼąąshdę́ę́ʼ ląʼígóó bee nikʼihojidlíi dooleeł.
Niheechʼínáánáháhígíí biyiʼ iłhodeezyéél, iłhózhǫ́, ayóóʼóʼóʼní bee nikʼihojidlíi dooleeł. Tʼáá sahdiigiʼ átʼéego baa hózhǫ́ǫgo nihił hanááhoolzhiizhígí biyiʼ tʼáá ałtsojįʼ iłhózhǫ́ nííʼ dooleeł.]],
		"Latn"
	},
	{
		-- The opening of the Iliad ([[s:el:Ιλιάς/Α]]), with macrons and
		-- breves added to mark the length of the monophthongs α, ι, υ:
		[[Μῆνῐν ᾰ̓́ειδε, θεᾱ́, Πηληῐ̈ᾰ́δεω Ᾰ̓χῐλῆος οὐλομένην, ἣ μῡρῐ́᾽ Ᾰ̓χαιοῖς ᾰ̓́λγε᾽ ἔθηκε, πολλᾱ̀ς δ᾽ ῐ̓φθῑ́μους ψῡχᾱ̀ς Ἄῐ̈δῐ προῐ̈́ᾰψεν ἡρώων, αὐτοὺς δὲ ἑλώρῐᾰ τεῦχε κῠ́νεσσιν οἰωνοῖσῐ́ τε πᾶσῐ· Δῐὸς δ᾽ ἐτελείετο βουλή·]],
		"Grek"
	},
	{
		-- The Brothers Karamazov: [[w:ru:Братья Карамазовы (Достоевский)/Книга первая]]
		[[Вот если вы не согласитесь с этим последним тезисом и ответите: «Не так» или «не всегда так», то я, пожалуй, и ободрюсь духом насчет значения героя моего Алексея Федоровича. Ибо не только чудак «не всегда» частность и обособление, а напротив, бывает так, что он-то, пожалуй, и носит в себе иной раз сердцевину целого, а остальные люди его эпохи — все, каким-нибудь наплывным ветром, на время почему-то от него оторвались…]],
		"Cyrl"
	},
	{
		-- Rig Veda: [[https://sa.wikisource.org/wiki/ऋग्वेदः_सूक्तं_१.१]]
		[[ॐ अग्निमीळे पुरोहितं यज्ञस्य देवमृत्विजम् । होतारं रत्नधातमम् ॥१॥ अग्निः पूर्वेभिरृषिभिरीड्यो नूतनैरुत । स देवाँ एह वक्षति ॥२॥ अग्निना रयिमश्नवत् पोषमेव दिवेदिवे । यशसं वीरवत्तमम् ॥३॥ अग्ने यं यज्ञमध्वरं विश्वतः परिभूरसि । स इद्देवेषु गच्छति ॥४॥ अग्निर्होता कविक्रतुः सत्यश्चित्रश्रवस्तमः । देवो देवेभिरा गमत् ॥५॥ यदङ्ग दाशुषे त्वमग्ने भद्रं करिष्यसि । तवेत् तत् सत्यमङ्गिरः ॥६॥ उप त्वाग्ने दिवेदिवे दोषावस्तर्धिया वयम् । नमो भरन्त एमसि ॥७॥ राजन्तमध्वराणां गोपामृतस्य दीदिविम् । वर्धमानं स्वे दमे ॥८॥ स नः पितेव सूनवेऽग्ने सूपायनो भव । सचस्वा नः स्वस्तये ॥९॥]],
		"Deva"
	},
}

-- Memoizing lookup: true when the last character of the key string matches
-- the mw.ustring pattern class %p (punctuation).
local ends_in_punctuation = setmetatable({}, {
	__index = function (self, key)
		local val = mw.ustring.match(mw.ustring.sub(key, -1), "%p") ~= nil
		self[key] = val
		return val
	end,
})

--- Build the label for a script example: the text itself (newlines turned
-- into <br>) followed by its per-script character counts.
-- @param script_example string
-- @return string
local function show_script_example(script_example)
	local separator = ": "

	-- If last character is punctuation, place script counts on their own line
	-- Could use Unicode_data.lookup_category, but that is more memory-intensive.
	if ends_in_punctuation[script_example] then
		separator = "<br>• "
	end

	-- gsub returns two values; inside the concatenation only the first (the
	-- substituted string) is used.
	return script_example:gsub('\n', '<br>') .. separator .. script_counts[script_example]
end

--- Checks Unicode_data.get_best_script against the shared script examples.
function p:test_get_best_script()
	self:iterate(script_examples, function (self, str, expected)
		self:equals(
			show_script_example(str),
			Unicode_data.get_best_script(str),
			expected)
	end)
end

--- Checks Unicode_data.is_Latin against the shared script examples.
-- The expected value is the row's third field when it is truthy; otherwise
-- it is whether the row's best script is "Latn".
function p:test_is_Latin()
	self:iterate(script_examples, function (self, str, best_script, is_Latin)
		self:equals(
			show_script_example(str),
			Unicode_data.is_Latin(str),
			is_Latin or best_script == "Latn")
	end)
end

--- Checks Unicode_data.lookup_block against { codepoint, block name } rows.
function p:test_lookup_block()
	local examples = {
		{ 0x0064, "Basic Latin" },
		{ 0x030B, "Combining Diacritical Marks" },
		{ 0x03A3, "Greek and Coptic" },
		{ 0x0411, "Cyrillic" },
		{ 0x10E6, "Georgian" },
		{ 0x3175, "Hangul Compatibility Jamo" },
		{ 0xAC01, "Hangul Syllables" },
		{ 0x4E0A, "CJK Unified Ideographs" },
		{ 0x1F608, "Emoticons" },
		{ 0x30000, "No Block" },
		{ 0x10FFFF, "Supplementary Private Use Area-B" },
	}

	self:iterate(examples, function (self, codepoint, block_name)
		self:equals(
			show(codepoint),
			Unicode_data.lookup_block(codepoint),
			block_name)
	end)
end

--- Checks Unicode_data.is_rtl against { text, expected boolean } rows.
-- Explanatory comments sit on the line above each example so that
-- bidirectional rendering in editors does not scramble them.
function p:test_is_rtl()
	local examples = {
		-- Ibn Battuta's full name
		{ "أبو عبد الله محمد بن عبد الله اللواتي الطنجي بن بطوطة", true },
		-- Example of incorrect input
		{ "أدب القاضي Adab al-qādī", false },
		-- Syriac, sixth beatitude (Matthew 5:8)
		{ "ܛܘܼܒܲܝܗܘܿܢ ܠܐܲܝܠܹܝܢ ܕܲܕ݂ܟܹܝܢ ܒܠܸܒ̇ܗܘܿܢ܄ ܕܗܸܢ݂ܘܿܢ ܢܸܚܙܘܿܢ ܠܐܲܠܵܗܵܐ܂", true },
		-- Hebrew: Genesis 1:1
		{ "בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ.", true },
		-- Adlam: name of alphabet
		{ "𞤀𞤣𞤤𞤢𞤥 𞤆𞤵𞤤𞤢𞤪", true },
		-- Avestan: Hymn to Haoma: Yasna 10.8 ([[wikt:𐬀𐬉𐬴𐬨𐬀]])
		{
			"𐬬𐬍𐬯𐬞𐬈 ⸱ 𐬰𐬍 ⸱ 𐬀𐬥𐬌𐬌𐬉 ⸱ 𐬨𐬀𐬜𐬃𐬢𐬵𐬋 ⸱ 𐬀𐬉𐬴𐬨𐬀 ⸱ 𐬵𐬀𐬗𐬌𐬧𐬙𐬈 ⸱ 𐬑𐬭𐬎𐬎𐬍𐬨 ⸱ 𐬛𐬭𐬎𐬎𐬋 ⸱ 𐬁𐬀𐬝 ⸱ 𐬵𐬋 ⸱ 𐬫𐬋 ⸱ 𐬵𐬀𐬊𐬨𐬀𐬵𐬈 ⸱ 𐬨𐬀𐬜𐬋 ⸱ 𐬀𐬴𐬀 ⸱ 𐬵𐬀𐬗𐬀𐬌𐬙𐬈",
			true
		},
		-- the word dhivehi written in Thaana script
		{ "ދިވެހި", true },
		-- Phoenician: Ahiram sarcophagus ([[wikt:𐤀𐤓𐤍]])
		{
			"𐤀𐤓𐤍𐤟𐤆𐤐𐤏𐤋𐤟𐤀𐤕𐤁𐤏𐤋𐤟𐤁𐤍𐤀𐤇𐤓𐤌𐤟𐤌𐤋𐤊𐤂𐤁𐤋𐤟𐤋𐤀𐤇𐤓𐤌𐤟𐤀𐤁𐤄",
			true
		},
		-- Mandaic: manda ḏ'haije ("knowledge of life"; [[wikt:ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ]])
		{ "ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ", true },
		-- Samaritan Hebrew: īargerēzēm ("Mount Gerizim"; [[wikt:Mount Gerizim]])
		{ "ࠄࠟࠓࠂࠝࠓࠜࠉࠆࠜࠉࠌ", true },
		{ "%$!^&", false },
	}

	self:iterate(examples, function (self, str, expected)
		self:equals(str, Unicode_data.is_rtl(str), expected)
	end)
end

return p