TagIterator (aka Tagiter)

Simple but very useful HTML/XHTML cascading parser.

Quickly iterate through tagged markup documents like HTML and XML. TagIterator is great for quick and dirty web scraping.

Usage

  # sample html
  stext = <<-EOF
  <body> This is a test...
    <sub> S1 </sub> <sub> S2 </sub>
    <DL>
      <DT> A1
      <DT> A2
      <DT> A3
    </DL>
    <DL>
      <DT> B1
      <DT> B2
      <DT> B3
    </DL>
    <NEST>
      <P ALIGN="R">TOP</P>
      <NEST>
        <P>SECOND</P>
        <OL>
          <LI>C1
          <LI>C2
          <LI>C3
          <LI>C4
        </OL>
      </NEST>
      <OL>
        <LI>D1
        <LI>D2
        <LI>D3
        <LI>D4
      </OL>
    </NEST>
  </body>
  EOF

  # Wrap the sample markup. Tag names are given in lowercase below even
  # though the sample writes most of them in uppercase.
  a = TagIterator.new(stext)
  a.first("body") do |y|
    # Second <DL> block: its <DT> items print B1, B2, B3.
    y.nth("dl",2) do |dl|
      dl.enumtag("dt") do |t|
        puts t.text.strip
      end
    end
    y.first("nest") do |n|
      n.first("p") do |c|
        # Prints the paragraph text followed by its attributes ("TOP align=R").
        print c.text, ' '
        puts c.attributes.collect{ |k,v| "#{k}=#{v}" }
      # first/nth return an iterator over the text after the matched block,
      # so .next (an alias of first) carries on from just past this <P>.
      end.next("nest") do |m|
        m.first("p") do |c|
          puts c.text
        end.next("ol") do |o|
          # <LI> items of the inner list: C1..C4.
          o.enumtag("li") do |i| puts i.text.strip end
        end
      end.next("ol") do |o|
        # <LI> items of the outer list: D1..D4.
        o.enumtag("li") do |i| puts i.text.strip end
      end
    end
  end
  # Visit every <sub> block in the document: prints S1 and S2.
  a.each_block("sub") do |y|
    puts y.text.strip
  end

produces

  B1
  B2
  B3
  TOP align=R
  SECOND
  C1
  C2
  C3
  C4
  D1
  D2
  D3
  D4
  S1
  S2
Methods
collect each_block enumcollect enumtag first for_this get_first get_nth new nth nth_tailer tagexist? tagnext
Attributes
[R] attributes
[RW] option
[R] tag
[R] text
Public Class methods
new(text,tag=nil,attributes={})
# File lib/more/facets/tagiterator.rb, line 120
  # Build an iterator over +text+.
  #
  # text       - markup to scan; must be a String.
  # tag        - tag name associated with this fragment (nil when not given).
  # attributes - Hash of attributes associated with this fragment.
  #
  # Raises RuntimeError when +text+ is not a String.
  def initialize(text,tag=nil,attributes={})
    unless text.is_a?(String)
      raise RuntimeError,"Only String accepted"
    end

    @text, @tag, @attributes = text, tag, attributes
    @option = "pi"   # NOTE(review): meaning of "pi" is not visible in this section

    # Give this particular hash a [] that downcases the requested name
    # before delegating to Hash#[], so lookups ignore the caller's casing.
    # NOTE(review): this only helps if keys are stored in lowercase --
    # confirm against parse_attribute, which is not shown here.
    def @attributes.[](aname)
      super(aname.downcase)
    end
  end
Public Instance methods
collect(*arg)
# File lib/more/facets/tagiterator.rb, line 229
  # Gather everything each_block yields for the same arguments.
  #
  # arg - forwarded unchanged to each_block.
  #
  # Returns an Array of the yielded objects, in the order they were yielded.
  # Whatever each_block raises propagates to the caller.
  def collect(*arg)
    blocks = []
    each_block(*arg) { |blk| blocks << blk }
    blocks
  end
each_block(tag,closetag=nil) {|self.class.new(@text[s..e],tag,parse_attribute(d))| ...}
# File lib/more/facets/tagiterator.rb, line 205
  # Yield each +tag+ block found in the text, in document order.
  #
  # tag      - name of the tag to look for.
  # closetag - optional name of a different tag that ends the block; when
  #            given it is passed to find_closetag together with +tag+.
  #
  # Each yielded object is a new instance of this class built from the slice
  # between the positions reported by find_opentag and find_closetag, plus
  # the attributes parsed from the opening tag. When no close position is
  # reported, the slice runs to the end of the text and iteration stops
  # after that block.
  #
  # Returns a new instance over the text following the '>' found after the
  # last closed block (empty when nothing follows).
  # Raises RuntimeError if +tag+ does not occur at all.
  def each_block(tag,closetag=nil)
    tail_from = 0
    open_at, attr_src = find_opentag(tag)
    raise RuntimeError,"tag(#{tag}) not found" unless open_at

    while open_at
      close_at = closetag ? find_closetag(closetag,open_at,tag) : find_closetag(tag,open_at)
      close_at ||= -1   # -1 makes the slice below run to the end of the text
      yield self.class.new(@text[open_at..close_at],tag,parse_attribute(attr_src))
      break if close_at < 0

      # Resume the search at the '>' after the close position, or at the
      # end of the text when there is none.
      tail_from = @text.index('>',close_at+1) || @text.length
      open_at, attr_src = find_opentag(tag,tail_from)
    end

    # String#[] returns nil once the start index is past the end of the
    # string, and the constructor rejects non-Strings, so fall back to "".
    self.class.new(text[tail_from+1..-1] || "")
  end
enumcollect(tag)
# File lib/more/facets/tagiterator.rb, line 245
  # Gather everything enumtag yields for +tag+.
  #
  # tag - forwarded unchanged to enumtag.
  #
  # Returns an Array of the yielded objects, in the order they were yielded.
  def enumcollect(tag)
    items = []
    enumtag(tag) { |item| items << item }
    items
  end
enumtag(tag) {|self.class.new(@text[s..e],tag,parse_attribute(d))| ...}
# File lib/more/facets/tagiterator.rb, line 235
  # Yield one new instance of this class per +tag+ item found in the text.
  #
  # tag - name of the item tag, passed to find_openenumtag and
  #       find_closeenumtag.
  #
  # Each item spans from the position reported by find_openenumtag to the
  # one reported by find_closeenumtag; when the latter reports nothing the
  # item extends to the end of the text. The search for the next item
  # restarts from the current item's start position.
  #
  # Returns nil. Nothing is yielded (and nothing raised) when no item exists.
  def enumtag(tag)
    item_at, attr_src = find_openenumtag(tag)
    while item_at
      item_end = find_closeenumtag(tag,item_at+1) || -1
      yield self.class.new(@text[item_at..item_end],tag,parse_attribute(attr_src))
      item_at, attr_src = find_openenumtag(tag,item_at)
    end
  end
first(tag,*arg) {|f| ...}
# File lib/more/facets/tagiterator.rb, line 202
  # Shorthand for nth(tag, 1, ...): yield the first +tag+ block.
  #
  # tag - name of the tag to look for.
  # arg - extra arguments forwarded to nth after the position.
  #
  # Returns whatever nth returns.
  def first(tag,*arg)
    nth(tag,1,*arg) { |blk| yield blk }
  end
  # +next+ is another name for +first+, so a chained call such as
  # first("p") { ... }.next("ol") { ... } reads naturally.
  alias_method :next, :first

  # Yield each +tag+ block found in the text, in document order.
  #
  # tag      - name of the tag to look for.
  # closetag - optional name of a different tag that ends the block; when
  #            given it is passed to find_closetag together with +tag+.
  #
  # Each yielded object is a new instance of this class built from the slice
  # between the positions reported by find_opentag and find_closetag, plus
  # the attributes parsed from the opening tag. When no close position is
  # reported, the slice runs to the end of the text and iteration stops
  # after that block.
  #
  # Returns a new instance over the text following the '>' found after the
  # last closed block (empty when nothing follows).
  # Raises RuntimeError if +tag+ does not occur at all.
  def each_block(tag,closetag=nil)
    tail_from = 0
    open_at, attr_src = find_opentag(tag)
    raise RuntimeError,"tag(#{tag}) not found" unless open_at

    while open_at
      close_at = closetag ? find_closetag(closetag,open_at,tag) : find_closetag(tag,open_at)
      close_at ||= -1   # -1 makes the slice below run to the end of the text
      yield self.class.new(@text[open_at..close_at],tag,parse_attribute(attr_src))
      break if close_at < 0

      # Resume the search at the '>' after the close position, or at the
      # end of the text when there is none.
      tail_from = @text.index('>',close_at+1) || @text.length
      open_at, attr_src = find_opentag(tag,tail_from)
    end

    # String#[] returns nil once the start index is past the end of the
    # string, and the constructor rejects non-Strings, so fall back to "".
    self.class.new(text[tail_from+1..-1] || "")
  end

  # Gather everything each_block yields for the same arguments.
  #
  # arg - forwarded unchanged to each_block.
  #
  # Returns an Array of the yielded objects, in the order they were yielded.
  # Whatever each_block raises propagates to the caller.
  def collect(*arg)
    blocks = []
    each_block(*arg) { |blk| blocks << blk }
    blocks
  end

  # Yield one new instance of this class per +tag+ item found in the text.
  #
  # tag - name of the item tag, passed to find_openenumtag and
  #       find_closeenumtag.
  #
  # Each item spans from the position reported by find_openenumtag to the
  # one reported by find_closeenumtag; when the latter reports nothing the
  # item extends to the end of the text. The search for the next item
  # restarts from the current item's start position.
  #
  # Returns nil. Nothing is yielded (and nothing raised) when no item exists.
  def enumtag(tag)
    item_at, attr_src = find_openenumtag(tag)
    while item_at
      item_end = find_closeenumtag(tag,item_at+1) || -1
      yield self.class.new(@text[item_at..item_end],tag,parse_attribute(attr_src))
      item_at, attr_src = find_openenumtag(tag,item_at)
    end
  end

  # Gather everything enumtag yields for +tag+.
  #
  # tag - forwarded unchanged to enumtag.
  #
  # Returns an Array of the yielded objects, in the order they were yielded.
  def enumcollect(tag)
    items = []
    enumtag(tag) { |item| items << item }
    items
  end

  # Yield this iterator itself to the given block.
  #
  # Returns the value of the block.
  def for_this
    yield self
  end

  # Block-free form of nth: hand back the object nth would yield.
  #
  # arg - forwarded unchanged to nth.
  #
  # Returns the object nth yielded. Errors raised by nth propagate.
  def get_nth(*arg)
    found = nil
    nth(*arg) { |blk| found = blk }
    found
  end

  # Block-free form of first: hand back the object first would yield.
  #
  # arg - forwarded unchanged to first.
  #
  # Returns the object first yielded. Errors raised by first propagate.
  def get_first(*arg)
    found = nil
    first(*arg) { |blk| found = blk }
    found
  end

  # Report whether find_element locates +tag+.
  #
  # tag - name of the tag to look for.
  # st  - forwarded to find_element (presumably the offset at which the
  #       search starts -- confirm against find_element).
  #
  # Returns true or false, never the raw result of find_element.
  def tagexist?(tag,st=0)
    find_element(tag,st) ? true : false
  end

  # Name of the first tag that appears in the text.
  #
  # Looks for the first '<' and the first '>' at or after it, then returns
  # the first run of characters inside that span that is neither '<', '>'
  # nor whitespace (so a closing tag is reported with its leading '/').
  #
  # Returns the tag name as a String, or nil when the text contains no '<',
  # the '<' is never closed by a '>', or the tag is empty ("<>").
  def tagnext
    open_at = @text.index("<")
    return nil unless open_at

    close_at = @text.index(">",open_at)
    # The original re-tested the '<' position here, so an unterminated tag
    # slipped through with a nil end index.
    return nil unless close_at

    @text[open_at..close_at].scan(/[^<>\s]+/)[0]
  end

  # Run nth for the n-th +tag+ block with an empty block, ignoring the
  # yielded fragment.
  #
  # tag - name of the tag to look for.
  # n   - position passed to nth.
  #
  # Returns whatever nth returns.
  def nth_tailer(tag,n)
    nth(tag,n) { }
  end

end



#  _____         _
# |_   _|__  ___| |_
#   | |/ _ \/ __| __|
#   | |  __/\__ \ |_
#   |_|\___||___/\__|
#

??

for_this() {|self| ...}
# File lib/more/facets/tagiterator.rb, line 251
  # Yield this iterator itself to the given block.
  #
  # Returns the value of the block.
  def for_this
    yield self
  end
get_first(*arg)
# File lib/more/facets/tagiterator.rb, line 257
  # Block-free form of first: hand back the object first would yield.
  #
  # arg - forwarded unchanged to first.
  #
  # Returns the object first yielded. Errors raised by first propagate.
  def get_first(*arg)
    found = nil
    first(*arg) { |blk| found = blk }
    found
  end
get_nth(*arg)
# File lib/more/facets/tagiterator.rb, line 255
  # Block-free form of nth: hand back the object nth would yield.
  #
  # arg - forwarded unchanged to nth.
  #
  # Returns the object nth yielded. Errors raised by nth propagate.
  def get_nth(*arg)
    found = nil
    nth(*arg) { |blk| found = blk }
    found
  end
nth(tag,n,closetag=nil) {|self.class.new(text[s..e],tag,parse_attribute(d))| ...}
# File lib/more/facets/tagiterator.rb, line 179
  # Yield the n-th +tag+ block and return an iterator over what follows it.
  #
  # tag      - name of the tag to look for.
  # n        - 1-based position of the wanted block; must not be nil.
  # closetag - optional name of a different tag that ends the block; when
  #            given it is passed to find_closetag together with +tag+.
  #
  # Blocks are located one after another: each search for an opening tag
  # starts at the '>' found after the previous block's close position. When
  # no close position is reported for a block, its slice runs to the end of
  # the text.
  #
  # NOTE(review): for n < 1 the loop body never runs, so the block receives
  # text[0..0] with parse_attribute(nil) -- confirm whether callers rely on it.
  #
  # Returns a new instance over the text after the n-th block (empty when
  # nothing follows).
  # Raises RuntimeError if +n+ is nil or fewer than +n+ blocks are found.
  def nth(tag,n,closetag=nil)
    raise RuntimeError,"nth: number not specified" unless n

    tail_from = 0
    open_at = close_at = 0   # declared here so the loop block can update them
    attr_src = nil

    1.upto(n) do |i|
      open_at, attr_src = find_opentag(tag,tail_from)
      raise RuntimeError,"tag(#{tag}) not found at(#{i})" unless open_at

      close_at = closetag ? find_closetag(closetag,open_at,tag) : find_closetag(tag,open_at)
      close_at ||= -1   # -1 makes the slice below run to the end of the text
      tail_from = @text.index('>',close_at+1) || @text.length
    end

    yield self.class.new(text[open_at..close_at],tag,parse_attribute(attr_src))
    # String#[] returns nil once the start index is past the end of the
    # string, and the constructor rejects non-Strings, so fall back to "".
    self.class.new(text[tail_from+1..-1] || "")
  end
nth_tailer(tag,n)
# File lib/more/facets/tagiterator.rb, line 272
  # Run nth for the n-th +tag+ block with an empty block, ignoring the
  # yielded fragment.
  #
  # tag - name of the tag to look for.
  # n   - position passed to nth.
  #
  # Returns whatever nth returns.
  def nth_tailer(tag,n)
    nth(tag,n) { }
  end
tagexist?(tag,st=0)
# File lib/more/facets/tagiterator.rb, line 259
  # Report whether find_element locates +tag+.
  #
  # tag - name of the tag to look for.
  # st  - forwarded to find_element (presumably the offset at which the
  #       search starts -- confirm against find_element).
  #
  # Returns true or false, never the raw result of find_element.
  def tagexist?(tag,st=0)
    find_element(tag,st) ? true : false
  end
tagnext()
# File lib/more/facets/tagiterator.rb, line 264
  # Name of the first tag that appears in the text.
  #
  # Looks for the first '<' and the first '>' at or after it, then returns
  # the first run of characters inside that span that is neither '<', '>'
  # nor whitespace (so a closing tag is reported with its leading '/').
  #
  # Returns the tag name as a String, or nil when the text contains no '<',
  # the '<' is never closed by a '>', or the tag is empty ("<>").
  def tagnext
    open_at = @text.index("<")
    return nil unless open_at

    close_at = @text.index(">",open_at)
    # The original re-tested the '<' position here, so an unterminated tag
    # slipped through with a nil end index.
    return nil unless close_at

    @text[open_at..close_at].scan(/[^<>\s]+/)[0]
  end