XQuery返回Unicode块的名称?

时间:2014-05-20 17:49:53

标签: unicode xquery

我有一个XQuery,它返回输入目录中所有不属于BasicLatin Unicode块的不重复字符。我不想返回字符本身,而是希望查询返回这些字符所属的Unicode块的名称(例如返回“Latin-1 Supplement”,而不是当前返回的“ä”(带分音符的a))。有没有办法做到这一点?

xquery version "1.0";
declare namespace ead = "urn:isbn:1-931666-22-9";
declare default element namespace "urn:isbn:1-931666-22-9";
declare copy-namespaces no-preserve, inherit;

import module namespace functx="http://www.functx.com" 
at "http://www.xqueryfunctions.com/xq/functx-1.0-doc-2007-01.xq";


(: Documents to scan. "[input_url]" is a placeholder for the collection URI.
   The document-node()+ type requires collection() to return at least one
   document node. :)
declare variable $COLL as document-node()+ := collection("[input_url]");

(: Reports every distinct character outside the Basic Latin block
   (U+0000-U+007F) found in any text node of $COLL, together with a count. :)

(: Only text nodes holding at least one non-Basic-Latin character are scanned. :)
let $non-BasicLatin_all := $COLL//text()[matches(., '\P{IsBasicLatin}')]
(: Work on code points instead of functx:get-matches: get-matches also yields
   an empty string for every non-matching stretch of text, which showed up as
   a spurious empty <character/> element and inflated distinct-count by one.
   Code points above 127 are exactly the characters outside Basic Latin. :)
let $non-BasicLatin_item :=
    for $text in $non-BasicLatin_all
    return string-to-codepoints($text)[. gt 127]
let $distinct_character :=
    for $cp in distinct-values($non-BasicLatin_item)
    (: distinct-values() has implementation-dependent order; sort for stable output. :)
    order by $cp
    return <character>{codepoints-to-string($cp)}</character>

return
<characters distinct-count="{count($distinct_character)}">
{$distinct_character}
</characters>

3 个答案:

答案 0 :(得分:1)

Unicode字符数据库(UCD)的XML版本可从Unicode联盟(Unicode Consortium)获得;其平面(flat)版本和分组(grouped)版本都包含如下形式的块列表:

<blocks>
  <block first-cp="0000" last-cp="007F" name="Basic Latin"/>
  <block first-cp="0080" last-cp="00FF" name="Latin-1 Supplement"/>
  <block first-cp="0100" last-cp="017F" name="Latin Extended-A"/>
  <block first-cp="0180" last-cp="024F" name="Latin Extended-B"/>
  ...
  <block first-cp="2F800" last-cp="2FA1F" name="CJK Compatibility Ideographs Supplement"/>
  <block first-cp="E0000" last-cp="E007F" name="Tags"/>
  <block first-cp="E0100" last-cp="E01EF" name="Variation Selectors Supplement"/>
  <block first-cp="F0000" last-cp="FFFFF" name="Supplementary Private Use Area-A"/>
  <block first-cp="100000" last-cp="10FFFF" name="Supplementary Private Use Area-B"/>
</blocks>

编写一个接受字符的函数,在数据库的块范围中查找该字符(比较前先把范围边界从十六进制转换为十进制),并返回块的名称(或对应的block元素)。如果速度成为问题,可以先复制一份块列表,并一次性把所有边界从十六进制转换为十进制。

请注意,在Unicode的生命周期中,某些块名称和一些块边界已发生变化;您需要决定要使用的数据库版本。

答案 1 :(得分:0)

XQuery中没有直接实现这一点的内置函数。您必须先取得字符的代码点,再找一个数据库自行查出对应的Unicode块;这个数据库最好是XML文档形式。

可以使用string-to-codepoints('漢字')取得代码点,它会返回一个整数序列。

答案 2 :(得分:0)

如果其他人觉得有用的话:这是我根据Jens Erat和C. M. Sperberg-McQueen的建议写出的查询(使用Unicode 6.3.0):

xquery version "1.0";
declare namespace ead = "urn:isbn:1-931666-22-9";
declare default element namespace "urn:isbn:1-931666-22-9";
declare copy-namespaces no-preserve, inherit;

import module namespace functx="http://www.functx.com" 
at "http://www.xqueryfunctions.com/xq/functx-1.0-doc-2007-01.xq";


(: Documents to scan. "[input_directory]" is a placeholder for the collection
   URI. The document-node()+ type requires collection() to return at least one
   document node. :)
declare variable $COLL as document-node()+ := collection("[input_directory]");

(: Text nodes that contain at least one character outside Basic Latin
   (U+0000-U+007F); all other text nodes are skipped. :)
let $non-BasicLatin_all := $COLL//text()[matches(., '\P{IsBasicLatin}')]
(: $distinct_character: the distinct code points (xs:integer) above U+007F
   found in those text nodes.
   Built-in string-to-codepoints() replaces the functx:get-matches pass: that
   function walks each string recursively, returns empty strings for the
   non-matching stretches (which then had to be filtered out), and made the
   distinct count depend on the default string collation. Comparing integers
   avoids all three. :)
let $distinct_character :=
    for $cp in distinct-values(
        for $text in $non-BasicLatin_all
        return string-to-codepoints($text)[. gt 127]
    )
    order by $cp
    return $cp
(: Lookup table of Unicode block ranges: @first and @last are inclusive decimal
   code points, @name is the block name (the accompanying text cites
   Unicode 6.3.0). Code points that fall in a gap between ranges match no entry. :)
let $block-ranges :=
    <blocks>
        <block first="0" last="127" name="Basic Latin"/>
        <block first="128" last="255" name="Latin-1 Supplement"/>
        <block first="256" last="383" name="Latin Extended-A"/>
        <block first="384" last="591" name="Latin Extended-B"/>
        <block first="592" last="687" name="IPA Extensions"/>
        <block first="688" last="767" name="Spacing Modifier Letters"/>
        <block first="768" last="879" name="Combining Diacritical Marks"/>
        <block first="880" last="1023" name="Greek and Coptic"/>
        <block first="1024" last="1279" name="Cyrillic"/>
        <block first="1280" last="1327" name="Cyrillic Supplement"/>
        <block first="1328" last="1423" name="Armenian"/>
        <block first="1424" last="1535" name="Hebrew"/>
        <block first="1536" last="1791" name="Arabic"/>
        <block first="1792" last="1871" name="Syriac"/>
        <block first="1872" last="1919" name="Arabic Supplement"/>
        <block first="1920" last="1983" name="Thaana"/>
        <block first="1984" last="2047" name="NKo"/>
        <block first="2048" last="2111" name="Samaritan"/>
        <block first="2112" last="2143" name="Mandaic"/>
        <block first="2208" last="2303" name="Arabic Extended-A"/>
        <block first="2304" last="2431" name="Devanagari"/>
        <block first="2432" last="2559" name="Bengali"/>
        <block first="2560" last="2687" name="Gurmukhi"/>
        <block first="2688" last="2815" name="Gujarati"/>
        <block first="2816" last="2943" name="Oriya"/>
        <block first="2944" last="3071" name="Tamil"/>
        <block first="3072" last="3199" name="Telugu"/>
        <block first="3200" last="3327" name="Kannada"/>
        <block first="3328" last="3455" name="Malayalam"/>
        <block first="3456" last="3583" name="Sinhala"/>
        <block first="3584" last="3711" name="Thai"/>
        <block first="3712" last="3839" name="Lao"/>
        <block first="3840" last="4095" name="Tibetan"/>
        <block first="4096" last="4255" name="Myanmar"/>
        <block first="4256" last="4351" name="Georgian"/>
        <block first="4352" last="4607" name="Hangul Jamo"/>
        <block first="4608" last="4991" name="Ethiopic"/>
        <block first="4992" last="5023" name="Ethiopic Supplement"/>
        <block first="5024" last="5119" name="Cherokee"/>
        <block first="5120" last="5759" name="Unified Canadian Aboriginal Syllabics"/>
        <block first="5760" last="5791" name="Ogham"/>
        <block first="5792" last="5887" name="Runic"/>
        <block first="5888" last="5919" name="Tagalog"/>
        <block first="5920" last="5951" name="Hanunoo"/>
        <block first="5952" last="5983" name="Buhid"/>
        <block first="5984" last="6015" name="Tagbanwa"/>
        <block first="6016" last="6143" name="Khmer"/>
        <block first="6144" last="6319" name="Mongolian"/>
        <block first="6320" last="6399" name="Unified Canadian Aboriginal Syllabics Extended"/>
        <block first="6400" last="6479" name="Limbu"/>
        <block first="6480" last="6527" name="Tai Le"/>
        <block first="6528" last="6623" name="New Tai Lue"/>
        <block first="6624" last="6655" name="Khmer Symbols"/>
        <block first="6656" last="6687" name="Buginese"/>
        <block first="6688" last="6831" name="Tai Tham"/>
        <block first="6912" last="7039" name="Balinese"/>
        <block first="7040" last="7103" name="Sundanese"/>
        <block first="7104" last="7167" name="Batak"/>
        <block first="7168" last="7247" name="Lepcha"/>
        <block first="7248" last="7295" name="Ol Chiki"/>
        <block first="7360" last="7375" name="Sundanese Supplement"/>
        <block first="7376" last="7423" name="Vedic Extensions"/>
        <block first="7424" last="7551" name="Phonetic Extensions"/>
        <block first="7552" last="7615" name="Phonetic Extensions Supplement"/>
        <block first="7616" last="7679" name="Combining Diacritical Marks Supplement"/>
        <block first="7680" last="7935" name="Latin Extended Additional"/>
        <block first="7936" last="8191" name="Greek Extended"/>
        <block first="8192" last="8303" name="General Punctuation"/>
        <block first="8304" last="8351" name="Superscripts and Subscripts"/>
        <block first="8352" last="8399" name="Currency Symbols"/>
        <block first="8400" last="8447" name="Combining Diacritical Marks for Symbols"/>
        <block first="8448" last="8527" name="Letterlike Symbols"/>
        <block first="8528" last="8591" name="Number Forms"/>
        <block first="8592" last="8703" name="Arrows"/>
        <block first="8704" last="8959" name="Mathematical Operators"/>
        <block first="8960" last="9215" name="Miscellaneous Technical"/>
        <block first="9216" last="9279" name="Control Pictures"/>
        <block first="9280" last="9311" name="Optical Character Recognition"/>
        <block first="9312" last="9471" name="Enclosed Alphanumerics"/>
        <block first="9472" last="9599" name="Box Drawing"/>
        <block first="9600" last="9631" name="Block Elements"/>
        <block first="9632" last="9727" name="Geometric Shapes"/>
        <block first="9728" last="9983" name="Miscellaneous Symbols"/>
        <block first="9984" last="10175" name="Dingbats"/>
        <block first="10176" last="10223" name="Miscellaneous Mathematical Symbols-A"/>
        <block first="10224" last="10239" name="Supplemental Arrows-A"/>
        <block first="10240" last="10495" name="Braille Patterns"/>
        <block first="10496" last="10623" name="Supplemental Arrows-B"/>
        <block first="10624" last="10751" name="Miscellaneous Mathematical Symbols-B"/>
        <block first="10752" last="11007" name="Supplemental Mathematical Operators"/>
        <block first="11008" last="11263" name="Miscellaneous Symbols and Arrows"/>
        <block first="11264" last="11359" name="Glagolitic"/>
        <block first="11360" last="11391" name="Latin Extended-C"/>
        <block first="11392" last="11519" name="Coptic"/>
        <block first="11520" last="11567" name="Georgian Supplement"/>
        <block first="11568" last="11647" name="Tifinagh"/>
        <block first="11648" last="11743" name="Ethiopic Extended"/>
        <block first="11744" last="11775" name="Cyrillic Extended-A"/>
        <block first="11776" last="11903" name="Supplemental Punctuation"/>
        <block first="11904" last="12031" name="CJK Radicals Supplement"/>
        <block first="12032" last="12255" name="Kangxi Radicals"/>
        <block first="12272" last="12287" name="Ideographic Description Characters"/>
        <block first="12288" last="12351" name="CJK Symbols and Punctuation"/>
        <block first="12352" last="12447" name="Hiragana"/>
        <block first="12448" last="12543" name="Katakana"/>
        <block first="12544" last="12591" name="Bopomofo"/>
        <block first="12592" last="12687" name="Hangul Compatibility Jamo"/>
        <block first="12688" last="12703" name="Kanbun"/>
        <block first="12704" last="12735" name="Bopomofo Extended"/>
        <block first="12736" last="12783" name="CJK Strokes"/>
        <block first="12784" last="12799" name="Katakana Phonetic Extensions"/>
        <block first="12800" last="13055" name="Enclosed CJK Letters and Months"/>
        <block first="13056" last="13311" name="CJK Compatibility"/>
        <block first="13312" last="19903" name="CJK Unified Ideographs Extension A"/>
        <block first="19904" last="19967" name="Yijing Hexagram Symbols"/>
        <block first="19968" last="40959" name="CJK Unified Ideographs"/>
        <block first="40960" last="42127" name="Yi Syllables"/>
        <block first="42128" last="42191" name="Yi Radicals"/>
        <block first="42192" last="42239" name="Lisu"/>
        <block first="42240" last="42559" name="Vai"/>
        <block first="42560" last="42655" name="Cyrillic Extended-B"/>
        <block first="42656" last="42751" name="Bamum"/>
        <block first="42752" last="42783" name="Modifier Tone Letters"/>
        <block first="42784" last="43007" name="Latin Extended-D"/>
        <block first="43008" last="43055" name="Syloti Nagri"/>
        <block first="43056" last="43071" name="Common Indic Number Forms"/>
        <block first="43072" last="43135" name="Phags-pa"/>
        <block first="43136" last="43231" name="Saurashtra"/>
        <block first="43232" last="43263" name="Devanagari Extended"/>
        <block first="43264" last="43311" name="Kayah Li"/>
        <block first="43312" last="43359" name="Rejang"/>
        <block first="43360" last="43391" name="Hangul Jamo Extended-A"/>
        <block first="43392" last="43487" name="Javanese"/>
        <block first="43520" last="43615" name="Cham"/>
        <block first="43616" last="43647" name="Myanmar Extended-A"/>
        <block first="43648" last="43743" name="Tai Viet"/>
        <block first="43744" last="43775" name="Meetei Mayek Extensions"/>
        <block first="43776" last="43823" name="Ethiopic Extended-A"/>
        <block first="43968" last="44031" name="Meetei Mayek"/>
        <block first="44032" last="55215" name="Hangul Syllables"/>
        <block first="55216" last="55295" name="Hangul Jamo Extended-B"/>
        <block first="55296" last="56191" name="High Surrogates"/>
        <block first="56192" last="56319" name="High Private Use Surrogates"/>
        <block first="56320" last="57343" name="Low Surrogates"/>
        <block first="57344" last="63743" name="Private Use Area"/>
        <block first="63744" last="64255" name="CJK Compatibility Ideographs"/>
        <block first="64256" last="64335" name="Alphabetic Presentation Forms"/>
        <block first="64336" last="65023" name="Arabic Presentation Forms-A"/>
        <block first="65024" last="65039" name="Variation Selectors"/>
        <block first="65040" last="65055" name="Vertical Forms"/>
        <block first="65056" last="65071" name="Combining Half Marks"/>
        <block first="65072" last="65103" name="CJK Compatibility Forms"/>
        <block first="65104" last="65135" name="Small Form Variants"/>
        <block first="65136" last="65279" name="Arabic Presentation Forms-B"/>
        <block first="65280" last="65519" name="Halfwidth and Fullwidth Forms"/>
        <block first="65520" last="65535" name="Specials"/>
        <block first="65536" last="65663" name="Linear B Syllabary"/>
        <block first="65664" last="65791" name="Linear B Ideograms"/>
        <block first="65792" last="65855" name="Aegean Numbers"/>
        <block first="65856" last="65935" name="Ancient Greek Numbers"/>
        <block first="65936" last="65999" name="Ancient Symbols"/>
        <block first="66000" last="66047" name="Phaistos Disc"/>
        <block first="66176" last="66207" name="Lycian"/>
        <block first="66208" last="66271" name="Carian"/>
        <block first="66304" last="66351" name="Old Italic"/>
        <block first="66352" last="66383" name="Gothic"/>
        <block first="66432" last="66463" name="Ugaritic"/>
        <block first="66464" last="66527" name="Old Persian"/>
        <block first="66560" last="66639" name="Deseret"/>
        <block first="66640" last="66687" name="Shavian"/>
        <block first="66688" last="66735" name="Osmanya"/>
        <block first="67584" last="67647" name="Cypriot Syllabary"/>
        <block first="67648" last="67679" name="Imperial Aramaic"/>
        <block first="67840" last="67871" name="Phoenician"/>
        <block first="67872" last="67903" name="Lydian"/>
        <block first="67968" last="67999" name="Meroitic Hieroglyphs"/>
        <block first="68000" last="68095" name="Meroitic Cursive"/>
        <block first="68096" last="68191" name="Kharoshthi"/>
        <block first="68192" last="68223" name="Old South Arabian"/>
        <block first="68352" last="68415" name="Avestan"/>
        <block first="68416" last="68447" name="Inscriptional Parthian"/>
        <block first="68448" last="68479" name="Inscriptional Pahlavi"/>
        <block first="68608" last="68687" name="Old Turkic"/>
        <block first="69216" last="69247" name="Rumi Numeral Symbols"/>
        <block first="69632" last="69759" name="Brahmi"/>
        <block first="69760" last="69839" name="Kaithi"/>
        <block first="69840" last="69887" name="Sora Sompeng"/>
        <block first="69888" last="69967" name="Chakma"/>
        <block first="70016" last="70111" name="Sharada"/>
        <block first="71296" last="71375" name="Takri"/>
        <block first="73728" last="74751" name="Cuneiform"/>
        <block first="74752" last="74879" name="Cuneiform Numbers and Punctuation"/>
        <block first="77824" last="78895" name="Egyptian Hieroglyphs"/>
        <block first="92160" last="92735" name="Bamum Supplement"/>
        <block first="93952" last="94111" name="Miao"/>
        <block first="110592" last="110847" name="Kana Supplement"/>
        <block first="118784" last="119039" name="Byzantine Musical Symbols"/>
        <block first="119040" last="119295" name="Musical Symbols"/>
        <block first="119296" last="119375" name="Ancient Greek Musical Notation"/>
        <block first="119552" last="119647" name="Tai Xuan Jing Symbols"/>
        <block first="119648" last="119679" name="Counting Rod Numerals"/>
        <block first="119808" last="120831" name="Mathematical Alphanumeric Symbols"/>
        <block first="126464" last="126719" name="Arabic Mathematical Alphabetic Symbols"/>
        <block first="126976" last="127023" name="Mahjong Tiles"/>
        <block first="127024" last="127135" name="Domino Tiles"/>
        <block first="127136" last="127231" name="Playing Cards"/>
        <block first="127232" last="127487" name="Enclosed Alphanumeric Supplement"/>
        <block first="127488" last="127743" name="Enclosed Ideographic Supplement"/>
        <block first="127744" last="128511" name="Miscellaneous Symbols And Pictographs"/>
        <block first="128512" last="128591" name="Emoticons"/>
        <block first="128640" last="128767" name="Transport And Map Symbols"/>
        <block first="128768" last="128895" name="Alchemical Symbols"/>
        <block first="131072" last="173791" name="CJK Unified Ideographs Extension B"/>
        <block first="173824" last="177983" name="CJK Unified Ideographs Extension C"/>
        <block first="177984" last="178207" name="CJK Unified Ideographs Extension D"/>
        <block first="194560" last="195103" name="CJK Compatibility Ideographs Supplement"/>
        <block first="917504" last="917631" name="Tags"/>
        <block first="917760" last="917999" name="Variation Selectors Supplement"/>
        <block first="983040" last="1048575" name="Supplementary Private Use Area-A"/>
        <block first="1048576" last="1114111" name="Supplementary Private Use Area-B"/>
    </blocks>
(: $block-name: one block name per distinct code point, taken from the first
   table entry whose inclusive range contains it; a code point that matches no
   entry contributes nothing (empty sequence). Child entries are selected
   with * so the lookup does not depend on the default element namespace. :)
let $block-name :=
    for $cp in distinct-values($distinct_character)
    return
        ($block-ranges/*[xs:integer(@first) le $cp and $cp le xs:integer(@last)])[1]/string(@name)
(: One <block> element per distinct block name, sorted by name. :)
let $distinct_block-name :=
    for $name in distinct-values($block-name)
    order by $name ascending
    return <block>{$name}</block>
(: Totals reported as attributes on the result element. :)
let $block-total := count($distinct_block-name)
let $character-total := count($distinct_character)

return
    <distinct-blocks distinct-block-count="{$block-total}" distinct-character-count="{$character-total}">{
        $distinct_block-name
    }</distinct-blocks>