module Addressable::IDNA

Constants

ACE_MAX_LENGTH
ACE_PREFIX
COMPOSITION_TABLE
HANGUL_LBASE
HANGUL_LCOUNT
HANGUL_NCOUNT
HANGUL_SBASE
HANGUL_SCOUNT
HANGUL_TBASE
HANGUL_TCOUNT
HANGUL_VBASE
HANGUL_VCOUNT
PUNYCODE_BASE
PUNYCODE_DAMP
PUNYCODE_DELIMITER
PUNYCODE_INITIAL_BIAS
PUNYCODE_INITIAL_N
PUNYCODE_MAXINT
PUNYCODE_PRINT_ASCII
PUNYCODE_SKEW
PUNYCODE_TMAX
PUNYCODE_TMIN
UNICODE_DATA

This is a sparse Unicode table. Codepoints without entries are assumed to have the value: [0, 0, nil, nil, nil, nil, nil]

UNICODE_DATA_CANONICAL
UNICODE_DATA_COMBINING_CLASS
UNICODE_DATA_COMPATIBILITY
UNICODE_DATA_EXCLUSION
UNICODE_DATA_LOWERCASE
UNICODE_DATA_TITLECASE
UNICODE_DATA_UPPERCASE
UNICODE_MAX_LENGTH
UNICODE_TABLE

This module is loosely based on idn_actionmailer by Mick Staugaard, the unicode library by Yoshida Masato, and the punycode implementation by Kazuhiro Nishiyama. Most of the code was copied verbatim, but some reformatting was done, and some translation from C was done.

Without their code to work from as a base, we'd all still be relying on the presence of libidn, which nobody ever seems to have installed.

Original sources: github.com/staugaard/idn_actionmailer, www.yoshidam.net/Ruby.html#unicode, and rubyforge.org/frs/?group_id=2550

UTF8_REGEX
UTF8_REGEX_MULTIBYTE

Public Class Methods

lookup_unicode_combining_class(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 274
# Looks up the combining class recorded for +codepoint+ in UNICODE_DATA.
#
# @param codepoint the key used to index UNICODE_DATA.
# @return the value stored in the UNICODE_DATA_COMBINING_CLASS slot of the
#   entry, or 0 when the codepoint has no entry or the slot is nil.
def self.lookup_unicode_combining_class(codepoint)
  entry = UNICODE_DATA[codepoint]
  # The table is sparse: a missing entry means "combining class 0".
  return 0 unless entry

  entry[UNICODE_DATA_COMBINING_CLASS] || 0
end
lookup_unicode_compatibility(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 282
# Looks up the compatibility mapping recorded for +codepoint+ in UNICODE_DATA.
#
# @param codepoint the key used to index UNICODE_DATA.
# @return the value stored in the UNICODE_DATA_COMPATIBILITY slot of the
#   entry, or nil when the codepoint has no entry.
def self.lookup_unicode_compatibility(codepoint)
  entry = UNICODE_DATA[codepoint]
  return nil unless entry

  entry[UNICODE_DATA_COMPATIBILITY]
end
lookup_unicode_composition(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 297
# Looks up +unpacked+ in COMPOSITION_TABLE.
#
# @param unpacked the lookup key; the caller visible in this file
#   (unicode_compose_pair) passes an Array of UTF-8 byte values.
# @return whatever COMPOSITION_TABLE holds for that key (nil for a plain
#   Hash miss).
def self.lookup_unicode_composition(unpacked)
  COMPOSITION_TABLE[unpacked]
end
lookup_unicode_lowercase(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 289
# Looks up the lowercase mapping recorded for +codepoint+ in UNICODE_DATA.
#
# @param codepoint the key used to index UNICODE_DATA.
# @return the value stored in the UNICODE_DATA_LOWERCASE slot of the entry;
#   +codepoint+ itself when there is no entry or the slot is nil.
def self.lookup_unicode_lowercase(codepoint)
  entry = UNICODE_DATA[codepoint]
  # No entry means no case mapping: the codepoint maps to itself.
  return codepoint unless entry

  entry[UNICODE_DATA_LOWERCASE] || codepoint
end
punycode_adapt(delta, numpoints, firsttime) click to toggle source

Bias adaptation method

# File lib/addressable/idna/pure.rb, line 651
# Bias adaptation step of the Punycode algorithm.
#
# @param delta the delta to scale.
# @param numpoints divisor used for the proportional increase of delta.
# @param firsttime truthy to damp by PUNYCODE_DAMP, falsy to halve.
# @return the new bias computed from the PUNYCODE_* constants.
def self.punycode_adapt(delta, numpoints, firsttime)
  scaled =
    if firsttime
      delta / PUNYCODE_DAMP
    else
      # Shifting right by one bit halves the value.
      delta >> 1
    end
  scaled += scaled / numpoints

  span = PUNYCODE_BASE - PUNYCODE_TMIN
  threshold = (span * PUNYCODE_TMAX) / 2

  # Each pass divides the scaled delta by the span and advances the
  # offset by one full base.
  offset = 0
  until scaled <= threshold
    scaled /= span
    offset += PUNYCODE_BASE
  end

  offset + (span + 1) * scaled / (scaled + PUNYCODE_SKEW)
end
punycode_basic?(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 619
# Tests whether +codepoint+ lies below 0x80.
#
# @param codepoint the value to test.
# @return [Boolean] true when +codepoint+ is less than 0x80.
def self.punycode_basic?(codepoint)
  first_non_basic = 0x80
  codepoint < first_non_basic
end
punycode_decode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 26
# Delegates to IDN::Punycode.decode and returns its result unchanged.
# NOTE(review): IDN is presumably the libidn binding used by native.rb —
# confirm. Nothing is rescued here, so errors from that call propagate.
#
# @param value passed straight through to IDN::Punycode.decode.
def self.punycode_decode(value)
  IDN::Punycode.decode(value)
end
punycode_decode_digit(codepoint) click to toggle source

Returns the numeric value of a basic codepoint (for use in representing integers), in the range 0 to PUNYCODE_BASE - 1, or PUNYCODE_BASE if the codepoint does not represent a value.

# File lib/addressable/idna/pure.rb, line 637
# Returns the numeric value of a basic codepoint (for use in representing
# integers), or PUNYCODE_BASE if the codepoint does not represent a value.
#
# The previous `codepoint - 48 < 10` style checks only work with unsigned
# arithmetic. With Ruby's signed integers they also matched every codepoint
# below the intended range (for example "-" or "["), yielding bogus or
# negative digits. Explicit ranges avoid that.
#
# @param codepoint [Integer] the codepoint to interpret as a digit.
# @return [Integer] 26..35 for "0".."9", 0..25 for "A".."Z" and "a".."z",
#   otherwise PUNYCODE_BASE.
def self.punycode_decode_digit(codepoint)
  case codepoint
  when 48..57  then codepoint - 22 # "0".."9" map to 26..35
  when 65..90  then codepoint - 65 # "A".."Z" map to 0..25
  when 97..122 then codepoint - 97 # "a".."z" map to 0..25
  else PUNYCODE_BASE
  end
end
punycode_delimiter?(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 624
# Tests whether +codepoint+ equals PUNYCODE_DELIMITER.
#
# @param codepoint the value to compare.
# @return the result of comparing +codepoint+ with PUNYCODE_DELIMITER via ==.
def self.punycode_delimiter?(codepoint)
  delimiter = PUNYCODE_DELIMITER
  codepoint == delimiter
end
punycode_encode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 22
# Delegates to IDN::Punycode.encode and returns its result unchanged.
# NOTE(review): IDN is presumably the libidn binding used by native.rb —
# confirm. Nothing is rescued here, so errors from that call propagate.
#
# @param value passed straight through to IDN::Punycode.encode.
def self.punycode_encode(value)
  IDN::Punycode.encode(value)
end
punycode_encode_digit(d) click to toggle source
# File lib/addressable/idna/pure.rb, line 629
# Maps a digit value to the codepoint that represents it.
#
# Values below 26 are offset by 97 (so 0 becomes 97, "a"); all other
# values are offset by 22 (so 26 becomes 48, "0").
#
# @param d the digit value to encode.
# @return +d+ plus the applicable offset.
def self.punycode_encode_digit(d)
  offset = d < 26 ? 97 : 22
  d + offset
end
to_ascii(value) click to toggle source
# File lib/addressable/idna/native.rb, line 34
# Delegates to IDN::Idna.toASCII and returns its result unchanged.
# NOTE(review): IDN is presumably the libidn binding used by native.rb —
# confirm. Nothing is rescued here, so errors from that call propagate.
#
# @param value passed straight through to IDN::Idna.toASCII.
def self.to_ascii(value)
  IDN::Idna.toASCII(value)
end
to_unicode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 38
# Delegates to IDN::Idna.toUnicode and returns its result unchanged.
# NOTE(review): IDN is presumably the libidn binding used by native.rb —
# confirm. Nothing is rescued here, so errors from that call propagate.
#
# @param value passed straight through to IDN::Idna.toUnicode.
def self.to_unicode(value)
  IDN::Idna.toUnicode(value)
end
unicode_compose(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 129
# Composes a sequence of codepoints by repeatedly trying to merge the
# current starter with the codepoint that follows it.
#
# The combining class of the current starter is now refreshed whenever the
# starter changes. Previously the updates were assigned to a misspelled
# local (+startercc+), so the class of the very first codepoint governed
# the whole sequence. Input that began with a combining mark could then
# never compose anything that followed.
#
# @param unpacked [Array<Integer>] codepoints to compose.
# @return [Array<Integer>] the composed codepoints; +unpacked+ itself is
#   returned when it is empty.
def self.unicode_compose(unpacked)
  return unpacked if unpacked.empty?

  composed = []
  starter = unpacked[0]
  starter_cc = lookup_unicode_combining_class(starter)
  # A leading codepoint with a non-zero class cannot act as a starter;
  # any non-zero value blocks composition, 256 is used as the marker.
  starter_cc = 256 if starter_cc != 0

  unpacked[1..-1].each do |ch|
    # Only a starter with combining class 0 may absorb the next codepoint.
    composite = starter_cc == 0 ? unicode_compose_pair(starter, ch) : nil

    if composite
      starter = composite
      starter_cc = lookup_unicode_combining_class(composite)
    else
      composed << starter
      starter = ch
      starter_cc = lookup_unicode_combining_class(ch)
    end
  end

  composed << starter
  composed
end
unicode_compose_pair(ch_one, ch_two) click to toggle source
# File lib/addressable/idna/pure.rb, line 157
# Attempts to compose two codepoints into one.
#
# Hangul pairs are composed arithmetically from the HANGUL_* constants.
# Any other pair is turned into its UTF-8 byte sequence and looked up via
# lookup_unicode_composition.
#
# @param ch_one [Integer] the first codepoint.
# @param ch_two [Integer] the codepoint that follows it.
# @return the composed Hangul codepoint, or whatever
#   lookup_unicode_composition returns for the pair's byte sequence.
def self.unicode_compose_pair(ch_one, ch_two)
  # Hangul L + V
  l_index = ch_one - HANGUL_LBASE
  v_index = ch_two - HANGUL_VBASE
  if l_index >= 0 && l_index < HANGUL_LCOUNT &&
      v_index >= 0 && v_index < HANGUL_VCOUNT
    return HANGUL_SBASE + (l_index * HANGUL_VCOUNT + v_index) * HANGUL_TCOUNT
  end

  # Hangul LV + T
  s_index = ch_one - HANGUL_SBASE
  t_index = ch_two - HANGUL_TBASE
  if s_index >= 0 && s_index < HANGUL_SCOUNT &&
      s_index % HANGUL_TCOUNT == 0 &&
      t_index >= 0 && t_index < HANGUL_TCOUNT
    return ch_one + t_index
  end

  # [exclusive upper bound, continuation byte count, lead byte marker]
  layouts = [
    [0x800, 1, 192],
    [0x10000, 2, 224],
    [0x200000, 3, 240],
    [0x4000000, 4, 248],
    [0x80000000, 5, 252]
  ]

  utf8_bytes = []
  [ch_one, ch_two].each do |ch|
    if ch < 128
      utf8_bytes << ch
      next
    end

    _upper, trailing, lead = layouts.find { |upper, _count, _marker| ch < upper }
    # Values at or above 0x80000000 contribute no bytes at all.
    next unless trailing

    utf8_bytes << (ch >> (6 * trailing) | lead)
    (trailing - 1).downto(0) do |position|
      utf8_bytes << ((ch >> (6 * position)) & 63 | 128)
    end
  end

  lookup_unicode_composition(utf8_bytes)
end
unicode_decompose(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 236
# Decomposes a sequence of codepoints.
#
# Codepoints in the Hangul syllable range are split with
# unicode_decompose_hangul. Any other codepoint with a compatibility
# mapping is replaced by the recursive decomposition of that mapping.
# Everything else is kept as is.
#
# @param unpacked [Array<Integer>] codepoints to decompose.
# @return [Array<Integer>] a new array with the decomposed codepoints.
def self.unicode_decompose(unpacked)
  unpacked.flat_map do |codepoint|
    if codepoint >= HANGUL_SBASE && codepoint < HANGUL_SBASE + HANGUL_SCOUNT
      lead, vowel, trail = unicode_decompose_hangul(codepoint)
      parts = [lead]
      parts << vowel if vowel
      parts << trail if trail
      parts
    elsif (mapping = lookup_unicode_compatibility(codepoint))
      # The mapping is a UTF-8 string; its own codepoints may map further.
      unicode_decompose(mapping.unpack("U*"))
    else
      [codepoint]
    end
  end
end
unicode_decompose_hangul(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 257
# Splits a Hangul syllable codepoint into its components using the
# HANGUL_* constants.
#
# @param codepoint [Integer] the codepoint to split.
# @return [Array] a three-element array [l, v, t]; +t+ is nil when the
#   trailing index is zero. For a codepoint outside the syllable range the
#   result is [codepoint, nil, nil].
def self.unicode_decompose_hangul(codepoint)
  syllable_index = codepoint - HANGUL_SBASE
  unless syllable_index >= 0 && syllable_index < HANGUL_SCOUNT
    return [codepoint, nil, nil]
  end

  lead = HANGUL_LBASE + syllable_index / HANGUL_NCOUNT
  vowel = HANGUL_VBASE + (syllable_index % HANGUL_NCOUNT) / HANGUL_TCOUNT
  trail_index = syllable_index % HANGUL_TCOUNT
  # A zero trailing index means the syllable has no trailing component.
  trail = trail_index == 0 ? nil : HANGUL_TBASE + trail_index

  [lead, vowel, trail]
end
unicode_downcase(input) click to toggle source

Unicode aware downcase method.

@api private

@param [String] input The input string.

@return [String] The downcased result.

# File lib/addressable/idna/pure.rb, line 122
# Unicode aware downcase method.
#
# Every codepoint of +input+ is passed through lookup_unicode_lowercase
# and the results are packed back into a string.
#
# @api private
# @param [String] input The input string.
# @return [String] The downcased result.
def self.unicode_downcase(input)
  input.unpack("U*").map { |codepoint| lookup_unicode_lowercase(codepoint) }.pack("U*")
end
unicode_normalize_kc(value) click to toggle source
# File lib/addressable/idna/native.rb, line 30
# Delegates to IDN::Stringprep.nfkc_normalize and returns its result
# unchanged.
# NOTE(review): IDN is presumably the libidn binding used by native.rb —
# confirm. Nothing is rescued here, so errors from that call propagate.
#
# @param value passed straight through to IDN::Stringprep.nfkc_normalize.
def self.unicode_normalize_kc(value)
  IDN::Stringprep.nfkc_normalize(value)
end
unicode_sort_canonical(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 212
# Reorders adjacent codepoints so that, within a run of codepoints with
# non-zero combining classes, lower classes come first.
#
# Works on a copy; +unpacked+ itself is not modified.
#
# @param unpacked [Array<Integer>] codepoints to reorder.
# @return [Array<Integer>] the reordered copy.
def self.unicode_sort_canonical(unpacked)
  ordered = unpacked.dup
  count = ordered.length
  return ordered if count < 2

  index = 1
  while index < count
    previous_cc = lookup_unicode_combining_class(ordered[index - 1])
    current_cc = lookup_unicode_combining_class(ordered[index])

    if current_cc != 0 && previous_cc != 0 && previous_cc > current_cc
      ordered[index - 1], ordered[index] = ordered[index], ordered[index - 1]
      # Step back so the moved codepoint is compared with its new neighbour.
      index -= 1 if index > 1
    else
      index += 1
    end
  end

  ordered
end