TagIterator (aka Tagiter)
Simple but very useful HTML/XHTML cascading parser.
Quickly iterate through tagged markup documents like HTML and XML. TagIterator is great for quick and dirty web scraping.
Usage
# sample html
stext = <<-EOF
<body> This is a test...
  <sub> S1 </sub> <sub> S2 </sub>
  <DL>
    <DT> A1
    <DT> A2
    <DT> A3
  </DL>
  <DL>
    <DT> B1
    <DT> B2
    <DT> B3
  </DL>
  <NEST>
    <P ALIGN="R">TOP</P>
    <NEST>
      <P>SECOND</P>
      <OL>
        <LI>C1
        <LI>C2
        <LI>C3
        <LI>C4
      </OL>
    </NEST>
    <OL>
      <LI>D1
      <LI>D2
      <LI>D3
      <LI>D4
    </OL>
  </NEST>
</body>
EOF

a = TagIterator.new(stext)

a.first("body") do |y|
  # second <DL> block: print each <DT> entry
  y.nth("dl",2) do |dl|
    dl.enumtag("dt") do |t|
      puts t.text.strip
    end
  end

  # outer <NEST>: each call chained with .next continues on the iterator
  # returned by the preceding call
  y.first("nest") do |n|
    n.first("p") do |c|
      print c.text, ' '
      puts c.attributes.collect{ |k,v| "#{k}=#{v}" }
    end.next("nest") do |m|
      m.first("p") do |c|
        puts c.text
      end.next("ol") do |o|
        o.enumtag("li") do |i|
          puts i.text.strip
        end
      end
    end.next("ol") do |o|
      o.enumtag("li") do |i|
        puts i.text.strip
      end
    end
  end
end

# every <sub> block in the document
a.each_block("sub") do |y|
  puts y.text.strip
end
produces
B1
B2
B3
TOP align=R
SECOND
C1
C2
C3
C4
D1
D2
D3
D4
S1
S2
Methods
collect
each_block
enumcollect
enumtag
first
for_this
get_first
get_nth
new
nth
nth_tailer
tagexist?
tagnext
Attributes
[R] | attributes | |
[RW] | option | |
[R] | tag | |
[R] | text |
Public Class methods
[ + ]
# File lib/more/facets/tagiterator.rb, line 120
# Sets up an iterator over +text+.
#
# text       - the markup to work on; must be a String.
# tag        - stored as the tag name (nil by default).
# attributes - Hash stored as the attribute table. Its #[] is overridden
#              on this one object so that the name is down-cased before
#              the lookup (attributes["ALIGN"] reads key "align").
#
# Raises RuntimeError ("Only String accepted") when text is not a String.
def initialize(text, tag = nil, attributes = {})
  unless text.is_a?(String)
    raise RuntimeError, "Only String accepted"
  end

  @text       = text
  @option     = "pi"
  @tag        = tag
  @attributes = attributes

  # Singleton method on this hash only; other hashes are unaffected.
  def @attributes.[](aname)
    super(aname.downcase)
  end
end
Public Instance methods
[ + ]
# File lib/more/facets/tagiterator.rb, line 229
# Gathers everything #each_block yields into an Array.
#
# arg - forwarded unchanged to #each_block.
#
# Returns the Array of yielded objects, in the order they were yielded.
def collect(*arg)
  gathered = []
  each_block(*arg) { |block| gathered << block }
  gathered
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 205
# Yields a new iterator for every +tag+ block found while scanning forward
# through the text.
#
# tag      - tag name handed to find_opentag.
# closetag - optional alternative closing tag; when given the block end is
#            find_closetag(closetag, start, tag), otherwise
#            find_closetag(tag, start).
#
# Each yielded object is built from @text[start..close], the tag name and
# parse_attribute of the opening tag's data. When no closing position is
# found the slice runs to the end of the text and scanning stops.
#
# Returns a new iterator over text[t+1..-1], where t is the position of the
# '>' that ended the last closed block (0 if no block was closed). The
# remainder is the empty string when nothing is left.
# Raises RuntimeError when no opening +tag+ is found at all.
def each_block(tag, closetag = nil)
  t = 0
  s, d = find_opentag(tag)
  raise RuntimeError, "tag(#{tag}) not found" unless s

  while s
    e = closetag ? find_closetag(closetag, s, tag) : find_closetag(tag, s)
    e ||= -1 # no closing tag: slice to the end of the text
    yield self.class.new(@text[s..e], tag, parse_attribute(d))

    if e >= 0
      # Resume after the '>' of the closing tag, or at the end of the text
      # when that '>' is missing.
      t = @text.index('>', e + 1) || @text.length
      s, d = find_opentag(tag, t)
    else
      s = false
    end
  end

  # t + 1 lies past the end of the text when the final closing tag had no
  # '>'; the slice is then nil, which the constructor would reject, so fall
  # back to an empty remainder.
  self.class.new(text[t + 1..-1] || "")
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 245
# Gathers everything #enumtag yields for +tag+ into an Array.
#
# tag - forwarded to #enumtag.
#
# Returns the Array of yielded objects, in the order they were yielded.
def enumcollect(tag)
  gathered = []
  enumtag(tag) { |item| gathered << item }
  gathered
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 235
# Yields a new iterator for each +tag+ item located by find_openenumtag.
#
# tag - tag name handed to find_openenumtag / find_closeenumtag.
#
# Each yielded object is built from @text[start..stop], the tag name and
# parse_attribute of the opening tag's data; stop is -1 (end of text) when
# find_closeenumtag finds nothing.
#
# Returns nil.
def enumtag(tag)
  start, raw_attrs = find_openenumtag(tag)

  while start
    stop = find_closeenumtag(tag, start + 1) || -1
    yield self.class.new(@text[start..stop], tag, parse_attribute(raw_attrs))
    start, raw_attrs = find_openenumtag(tag, start)
  end
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 202
# Yields the first +tag+ block; shorthand for nth(tag, 1, ...).
#
# tag - tag name to look for.
# arg - extra arguments forwarded to #nth after the position.
#
# Returns whatever #nth returns.
def first(tag, *arg)
  nth(tag, 1, *arg) { |found| yield found }
end

# #next is another name for #first.
alias_method :next, :first
[ + ]
# File lib/more/facets/tagiterator.rb, line 251
# Yields the receiver itself to the block.
#
# Returns the block's result.
def for_this
  yield(self)
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 257
# Non-block form of #first: captures the object #first yields.
#
# arg - forwarded unchanged to #first.
#
# Returns the yielded object, or nil if #first yields nothing.
def get_first(*arg)
  found = nil
  first(*arg) { |block| found = block }
  found
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 255
# Non-block form of #nth: captures the object #nth yields.
#
# arg - forwarded unchanged to #nth.
#
# Returns the yielded object, or nil if #nth yields nothing.
def get_nth(*arg)
  found = nil
  nth(*arg) { |block| found = block }
  found
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 179
# Yields a new iterator for the n-th +tag+ block, counting from 1.
#
# tag      - tag name handed to find_opentag.
# n        - which occurrence to yield; must not be nil.
# closetag - optional alternative closing tag; when given the block end is
#            find_closetag(closetag, start, tag), otherwise
#            find_closetag(tag, start).
#
# The yielded object is built from text[start..close], the tag name and
# parse_attribute of the opening tag's data; close is -1 (end of text) when
# no closing position is found.
#
# Returns a new iterator over the text after the '>' that follows the
# n-th block's closing position; the empty string when nothing is left.
# Raises RuntimeError when n is nil or when fewer than n blocks exist.
def nth(tag, n, closetag = nil)
  raise RuntimeError, "nth: number not specified" unless n

  t = 0
  e = s = 0 # assigned out here so the values set in the block survive it
  d = nil

  1.upto(n) do |i|
    s, d = find_opentag(tag, t)
    raise RuntimeError, "tag(#{tag}) not found at(#{i})" unless s

    e = closetag ? find_closetag(closetag, s, tag) : find_closetag(tag, s)
    e ||= -1
    # Next search starts at the '>' after the closing position, or at the
    # end of the text when there is none.
    t = @text.index('>', e + 1) || @text.length
  end

  yield self.class.new(text[s..e], tag, parse_attribute(d))

  # t + 1 lies past the end of the text when the closing tag had no '>';
  # the slice is then nil, which the constructor would reject, so fall
  # back to an empty remainder.
  self.class.new(text[t + 1..-1] || "")
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 272
# Calls #nth with an empty block, discarding the yielded block itself.
#
# tag - forwarded to #nth.
# n   - forwarded to #nth.
#
# Returns whatever #nth returns.
def nth_tailer(tag, n)
  nth(tag, n) { }
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 259
# Reports whether find_element locates +tag+.
#
# tag - forwarded to find_element.
# st  - forwarded to find_element as its second argument (default 0).
#
# Returns true when find_element returns anything other than nil/false,
# false otherwise.
def tagexist?(tag, st = 0)
  find_element(tag, st) ? true : false
end
[ + ]
# File lib/more/facets/tagiterator.rb, line 264
# Returns the first token inside the first "<...>" of the text, i.e. the
# first run of characters there that are not '<', '>' or whitespace.
#
# Returns nil when the text has no '<', when that '<' is never followed by
# a '>', or when nothing but brackets and whitespace lies between them.
def tagnext
  open_at = @text.index("<")
  return nil unless open_at

  close_at = @text.index(">", open_at)
  # Guard on the closing bracket: with a nil end the slice below would run
  # to the end of the text (or raise on Rubies without endless ranges).
  return nil unless close_at

  @text[open_at..close_at].scan(/[^<>\s]+/)[0]
end