Gumbo  1.0
A C library for parsing HTML.
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Pages
gumbo.h
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16 //
17 // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
18 // GUMBO_ as a prefix for enum constants (static constants get the Google-style
19 // kGumbo prefix).
20 
42 #ifndef GUMBO_GUMBO_H_
43 #define GUMBO_GUMBO_H_
44 
45 #include <stdbool.h>
46 #include <stddef.h>
47 
48 #ifdef __cplusplus
49 extern "C" {
50 #endif
51 
// A position record with three unsigned counters: line, column and offset.
// NOTE(review): whether these are 0- or 1-based, and the unit of `offset`, are not visible in this listing - confirm against the upstream header.
// Listing line 66 (the closing line that names the typedef) was dropped by the page extraction; it is restored here so the definition is terminated.
62 typedef struct {
63  unsigned int line;
64  unsigned int column;
65  unsigned int offset;
66 } GumboSourcePosition;
67 
73 
74 
// A (pointer, length) pair describing a run of characters.
// NOTE(review): presumably the run is not NUL-terminated and `length` counts bytes - not shown here, confirm.
// Listing line 90 (the closing line that names the typedef) was dropped by the page extraction; it is restored here.
84 typedef struct {
    // Start of the character run.
86  const char* data;
87 
    // Number of characters in the run.
89  size_t length;
90 } GumboStringPiece;
91 
94 
// Declaration: takes two GumboStringPiece pointers and returns a bool (presumably true when the two pieces hold the same characters).
// Listing line 99, which carries the return type and function name, was dropped by the page extraction; it is restored from the member index at the end of this page.
// NOTE(review): the index lists this function and gumbo_string_equals_ignore_case with identical parameter lists; placing this one first follows the upstream header order - confirm.
99 bool gumbo_string_equals(
100  const GumboStringPiece* str1, const GumboStringPiece* str2);
101 
// Declaration: takes two GumboStringPiece pointers and returns a bool (presumably a case-insensitive equality test, going by the name).
// Listing line 106, which carries the return type and function name, was dropped by the page extraction; it is restored from the member index at the end of this page.
// NOTE(review): which characters are case-folded (ASCII only or more) is not visible here - confirm.
106 bool gumbo_string_equals_ignore_case(
107  const GumboStringPiece* str1, const GumboStringPiece* str2);
108 
109 
// An untyped vector: an array of void pointers plus two unsigned counters.
// Users annotate the element type with an inline comment at the point of use,
// e.g. `GumboVector /* GumboNode* */ children`.
119 typedef struct {
    // Array of element pointers.
123  void** data;
124 
    // NOTE(review): presumably the number of elements currently stored - confirm.
126  unsigned int length;
127 
    // NOTE(review): presumably the number of slots allocated in `data` - confirm.
129  unsigned int capacity;
130 } GumboVector;
131 
// Declaration of a constant GumboVector that is defined in another translation unit.
// NOTE(review): going by the name it is presumably an empty vector; its field values are not visible here.
133 extern const GumboVector kGumboEmptyVector;
134 
// Declaration: takes a vector and an element pointer and returns an int.
// NOTE(review): presumably the index of `element` within `vector`; the comparison rule and the
// value returned when the element is absent are not visible in this listing - confirm.
139 int gumbo_vector_index_of(GumboVector* vector, void* element);
140 
141 
// Enumeration of the tags the library knows by name. The entries are grouped by
// the spec section named in the URL comment that precedes each group.
// No enumerator has an explicit value, so they number consecutively from 0
// (GUMBO_TAG_HTML) in declaration order; inserting or reordering entries changes
// every later numeric value.
// GUMBO_TAG_UNKNOWN and GUMBO_TAG_LAST are the final two entries: the first
// stands for tags without special handling, the second is an end-of-enum marker
// rather than a real tag (see their inline comments).
154 typedef enum {
155  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
156  GUMBO_TAG_HTML,
157  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
158  GUMBO_TAG_HEAD,
159  GUMBO_TAG_TITLE,
160  GUMBO_TAG_BASE,
161  GUMBO_TAG_LINK,
162  GUMBO_TAG_META,
163  GUMBO_TAG_STYLE,
164  // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
165  GUMBO_TAG_SCRIPT,
166  GUMBO_TAG_NOSCRIPT,
167  GUMBO_TAG_TEMPLATE,
168  // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
169  GUMBO_TAG_BODY,
170  GUMBO_TAG_ARTICLE,
171  GUMBO_TAG_SECTION,
172  GUMBO_TAG_NAV,
173  GUMBO_TAG_ASIDE,
174  GUMBO_TAG_H1,
175  GUMBO_TAG_H2,
176  GUMBO_TAG_H3,
177  GUMBO_TAG_H4,
178  GUMBO_TAG_H5,
179  GUMBO_TAG_H6,
180  GUMBO_TAG_HGROUP,
181  GUMBO_TAG_HEADER,
182  GUMBO_TAG_FOOTER,
183  GUMBO_TAG_ADDRESS,
184  // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
185  GUMBO_TAG_P,
186  GUMBO_TAG_HR,
187  GUMBO_TAG_PRE,
188  GUMBO_TAG_BLOCKQUOTE,
189  GUMBO_TAG_OL,
190  GUMBO_TAG_UL,
191  GUMBO_TAG_LI,
192  GUMBO_TAG_DL,
193  GUMBO_TAG_DT,
194  GUMBO_TAG_DD,
195  GUMBO_TAG_FIGURE,
196  GUMBO_TAG_FIGCAPTION,
197  GUMBO_TAG_MAIN,
198  GUMBO_TAG_DIV,
199  // http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
200  GUMBO_TAG_A,
201  GUMBO_TAG_EM,
202  GUMBO_TAG_STRONG,
203  GUMBO_TAG_SMALL,
204  GUMBO_TAG_S,
205  GUMBO_TAG_CITE,
206  GUMBO_TAG_Q,
207  GUMBO_TAG_DFN,
208  GUMBO_TAG_ABBR,
209  GUMBO_TAG_DATA,
210  GUMBO_TAG_TIME,
211  GUMBO_TAG_CODE,
212  GUMBO_TAG_VAR,
213  GUMBO_TAG_SAMP,
214  GUMBO_TAG_KBD,
215  GUMBO_TAG_SUB,
216  GUMBO_TAG_SUP,
217  GUMBO_TAG_I,
218  GUMBO_TAG_B,
219  GUMBO_TAG_U,
220  GUMBO_TAG_MARK,
221  GUMBO_TAG_RUBY,
222  GUMBO_TAG_RT,
223  GUMBO_TAG_RP,
224  GUMBO_TAG_BDI,
225  GUMBO_TAG_BDO,
226  GUMBO_TAG_SPAN,
227  GUMBO_TAG_BR,
228  GUMBO_TAG_WBR,
229  // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
230  GUMBO_TAG_INS,
231  GUMBO_TAG_DEL,
232  // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
233  GUMBO_TAG_IMAGE,
234  GUMBO_TAG_IMG,
235  GUMBO_TAG_IFRAME,
236  GUMBO_TAG_EMBED,
237  GUMBO_TAG_OBJECT,
238  GUMBO_TAG_PARAM,
239  GUMBO_TAG_VIDEO,
240  GUMBO_TAG_AUDIO,
241  GUMBO_TAG_SOURCE,
242  GUMBO_TAG_TRACK,
243  GUMBO_TAG_CANVAS,
244  GUMBO_TAG_MAP,
245  GUMBO_TAG_AREA,
246  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
247  GUMBO_TAG_MATH,
248  GUMBO_TAG_MI,
249  GUMBO_TAG_MO,
250  GUMBO_TAG_MN,
251  GUMBO_TAG_MS,
252  GUMBO_TAG_MTEXT,
253  GUMBO_TAG_MGLYPH,
254  GUMBO_TAG_MALIGNMARK,
255  GUMBO_TAG_ANNOTATION_XML,
256  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
257  GUMBO_TAG_SVG,
258  GUMBO_TAG_FOREIGNOBJECT,
259  GUMBO_TAG_DESC,
260  // SVG title tags will have GUMBO_TAG_TITLE as with HTML.
261  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
262  GUMBO_TAG_TABLE,
263  GUMBO_TAG_CAPTION,
264  GUMBO_TAG_COLGROUP,
265  GUMBO_TAG_COL,
266  GUMBO_TAG_TBODY,
267  GUMBO_TAG_THEAD,
268  GUMBO_TAG_TFOOT,
269  GUMBO_TAG_TR,
270  GUMBO_TAG_TD,
271  GUMBO_TAG_TH,
272  // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
273  GUMBO_TAG_FORM,
274  GUMBO_TAG_FIELDSET,
275  GUMBO_TAG_LEGEND,
276  GUMBO_TAG_LABEL,
277  GUMBO_TAG_INPUT,
278  GUMBO_TAG_BUTTON,
279  GUMBO_TAG_SELECT,
280  GUMBO_TAG_DATALIST,
281  GUMBO_TAG_OPTGROUP,
282  GUMBO_TAG_OPTION,
283  GUMBO_TAG_TEXTAREA,
284  GUMBO_TAG_KEYGEN,
285  GUMBO_TAG_OUTPUT,
286  GUMBO_TAG_PROGRESS,
287  GUMBO_TAG_METER,
288  // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
289  GUMBO_TAG_DETAILS,
290  GUMBO_TAG_SUMMARY,
291  GUMBO_TAG_MENU,
292  GUMBO_TAG_MENUITEM,
293  // Non-conforming elements that nonetheless appear in the HTML5 spec.
294  // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
295  GUMBO_TAG_APPLET,
296  GUMBO_TAG_ACRONYM,
297  GUMBO_TAG_BGSOUND,
298  GUMBO_TAG_DIR,
299  GUMBO_TAG_FRAME,
300  GUMBO_TAG_FRAMESET,
301  GUMBO_TAG_NOFRAMES,
302  GUMBO_TAG_ISINDEX,
303  GUMBO_TAG_LISTING,
304  GUMBO_TAG_XMP,
305  GUMBO_TAG_NEXTID,
306  GUMBO_TAG_NOEMBED,
307  GUMBO_TAG_PLAINTEXT,
308  GUMBO_TAG_RB,
309  GUMBO_TAG_STRIKE,
310  GUMBO_TAG_BASEFONT,
311  GUMBO_TAG_BIG,
312  GUMBO_TAG_BLINK,
313  GUMBO_TAG_CENTER,
314  GUMBO_TAG_FONT,
315  GUMBO_TAG_MARQUEE,
316  GUMBO_TAG_MULTICOL,
317  GUMBO_TAG_NOBR,
318  GUMBO_TAG_SPACER,
319  GUMBO_TAG_TT,
320  // Used for all tags that don't have special handling in HTML.
321  GUMBO_TAG_UNKNOWN,
322  // A marker value to indicate the end of the enum, for iterating over it.
323  // Also used as the terminator for varargs functions that take tags.
324  GUMBO_TAG_LAST,
325 } GumboTag;
326 
// Declaration: takes a GumboTag and returns a pointer to const char.
// NOTE(review): presumably the canonical name string for `tag`; who owns the returned
// string and what is returned for GUMBO_TAG_UNKNOWN / GUMBO_TAG_LAST is not visible here - confirm.
332 const char* gumbo_normalized_tagname(GumboTag tag);
333 
345 
// Declaration: takes a GumboStringPiece holding a tag name and returns a pointer to const char.
// NOTE(review): presumably maps an SVG tag name to its normalized spelling; the result for a
// name that is not a known SVG tag (possibly NULL) is not visible here - confirm.
358 const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
359 
// Declaration: takes a C string and returns a GumboTag.
// NOTE(review): presumably the inverse of gumbo_normalized_tagname; case sensitivity and the
// result for unrecognized names (likely GUMBO_TAG_UNKNOWN) are not visible here - confirm.
364 GumboTag gumbo_tag_enum(const char* tagname);
365 
// Namespace tag for an attribute: none, XLINK, XML or XMLNS.
// No explicit values are given, so GUMBO_ATTR_NAMESPACE_NONE is 0 and the rest follow consecutively.
// Listing line 376 (the closing line that names the typedef) was dropped by the page extraction;
// it is restored from the member index at the end of this page.
371 typedef enum {
372  GUMBO_ATTR_NAMESPACE_NONE,
373  GUMBO_ATTR_NAMESPACE_XLINK,
374  GUMBO_ATTR_NAMESPACE_XML,
375  GUMBO_ATTR_NAMESPACE_XMLNS,
376 } GumboAttributeNamespaceEnum;
377 
// One attribute: a namespace tag, name and value as C strings, the corresponding
// GumboStringPiece "original" forms, and four source positions bracketing the name and value.
// The page extraction dropped listing lines 390, 402, 420, 423, 430, 433, 436 and the closing
// line 437; they are restored from the member index at the end of this page, which records each
// member's type, name and listing line.
// NOTE(review): presumably the original_* pieces reference the unmodified input text while
// name/value are processed copies - not visible here, confirm.
383 typedef struct {
390  GumboAttributeNamespaceEnum attr_namespace;
391 
396  const char* name;
397 
402  GumboStringPiece original_name;
403 
410  const char* value;
411 
420  GumboStringPiece original_value;
421 
423  GumboSourcePosition name_start;
424 
430  GumboSourcePosition name_end;
431 
433  GumboSourcePosition value_start;
434 
436  GumboSourcePosition value_end;
437 } GumboAttribute;
438 
// Declaration: takes a vector and a C-string name and returns a GumboAttribute pointer.
// NOTE(review): presumably `attrs` holds GumboAttribute* elements (as in the element struct's
// `attributes` field) and the result is the entry whose name matches; case sensitivity and the
// not-found result (likely NULL) are not visible here - confirm.
444 GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
445 
// Enumeration of node kinds.
// NOTE(review): every enumerator line between listing lines 450 and 463 is missing from this
// page extraction; the index at the end of the page records definitions at listing lines
// 452, 454, 456, 458, 460 and 462. GUMBO_NODE_DOCUMENT and GUMBO_NODE_ELEMENT are named by the
// comments on the node union further down this listing; the remaining names must be recovered
// from the upstream header before this can compile.
450 typedef enum {
463 } GumboNodeType;
464 
// Forward declaration: makes GumboNode an alias for struct GumboInternalNode before the struct
// body appears, so earlier declarations can refer to GumboNode pointers (e.g. vectors annotated
// with `/* GumboNode* */`).
469 typedef struct GumboInternalNode GumboNode;
470 
// Document quirks-mode classification: no-quirks, quirks, or limited-quirks.
// No explicit values are given, so GUMBO_DOCTYPE_NO_QUIRKS is 0 and the rest follow consecutively.
// Listing line 476 (the closing line that names the typedef) was dropped by the page extraction;
// it is restored from the member index at the end of this page.
472 typedef enum {
473  GUMBO_DOCTYPE_NO_QUIRKS,
474  GUMBO_DOCTYPE_QUIRKS,
475  GUMBO_DOCTYPE_LIMITED_QUIRKS
476 } GumboQuirksModeEnum;
477 
// Element namespace: HTML, SVG or MathML.
// No explicit values are given, so GUMBO_NAMESPACE_HTML is 0 and the rest follow consecutively.
// Listing line 489 (the closing line that names the typedef) was dropped by the page extraction;
// it is restored from the member index at the end of this page.
485 typedef enum {
486  GUMBO_NAMESPACE_HTML,
487  GUMBO_NAMESPACE_SVG,
488  GUMBO_NAMESPACE_MATHML
489 } GumboNamespaceEnum;
490 
// Opening of the GumboParseFlags enumeration.
// NOTE(review): this definition is incomplete in the page extraction. All enumerator lines and
// the closing line are missing; the index at the end of the page records enumerator definitions
// at listing lines 504, 512, 525, 533, 544, 547 and 566 (possibly among others) but not their
// names. The one surviving comment mentions "Value 1 << 2", so the enumerators appear to be
// bit-flag values. Recover the missing lines from the upstream header before using this listing
// as code.
499 typedef enum {
505 
513 
526 
527  // Value 1 << 2 was for a flag that has since been removed.
528 
534 
542 
545 
548 
555 
558 
561 
568 
569 
// Payload for a document node: its children, whether a doctype token was present, the three
// doctype strings, and the quirks-mode classification.
// The page extraction dropped listing line 593 (doc_type_quirks_mode); it is restored from the
// member index at the end of this page.
573 typedef struct {
    // Child nodes; elements are GumboNode pointers per the inline annotation.
579  GumboVector /* GumboNode* */ children;
580 
581  // True if there was an explicit doctype token as opposed to it being omitted.
582  bool has_doctype;
583 
584  // Fields from the doctype token, copied verbatim.
585  const char* name;
586  const char* public_identifier;
587  const char* system_identifier;
588 
    // NOTE(review): presumably the quirks mode derived from the doctype - confirm.
593  GumboQuirksModeEnum doc_type_quirks_mode;
594 } GumboDocument;
595 
// Payload for text-like nodes: the text as a C string, a GumboStringPiece "original" form,
// and a source position.
// The page extraction dropped listing lines 611 (original_text) and 617 (start_pos); they are
// restored from the member index at the end of this page.
// NOTE(review): presumably original_text references the unmodified input and start_pos marks
// where the text began in the source - not visible here, confirm.
600 typedef struct {
605  const char* text;
606 
611  GumboStringPiece original_text;
612 
617  GumboSourcePosition start_pos;
618 } GumboText;
619 
// Payload for an element node: children, tag and namespace, the original start/end tag text,
// start and end source positions, and the attribute list.
// The page extraction dropped listing lines 632, 635, 643, 650, 653 and 656; they are restored
// from the member index at the end of this page, which records each member's type, name and
// listing line.
624 typedef struct {
    // Child nodes; elements are GumboNode pointers per the inline annotation.
629  GumboVector /* GumboNode* */ children;
630 
632  GumboTag tag;
633 
635  GumboNamespaceEnum tag_namespace;
636 
    // NOTE(review): presumably the start tag exactly as written in the input - confirm.
643  GumboStringPiece original_tag;
644 
    // NOTE(review): presumably the end tag as written in the input; its value when the end tag
    // was omitted is not visible here - confirm.
650  GumboStringPiece original_end_tag;
651 
653  GumboSourcePosition start_pos;
654 
656  GumboSourcePosition end_pos;
657 
    // Attributes; elements are GumboAttribute pointers per the inline annotation.
662  GumboVector /* GumboAttribute* */ attributes;
663 } GumboElement;
664 
// Body of struct GumboInternalNode (aliased as GumboNode): a node type, a parent pointer, an
// index within the parent, parse flags, and a union holding the type-specific payload.
// The page extraction dropped the struct header (listing line 669) and the members at listing
// lines 671, 677 and 684, leaving an orphaned body that ends in `};`. They are restored from the
// member index at the end of this page and from the forward declaration
// `typedef struct GumboInternalNode GumboNode;`.
669 struct GumboInternalNode {
    // NOTE(review): presumably selects which member of `v` is valid - confirm.
671  GumboNodeType type;
672 
674  GumboNode* parent;
675 
    // NOTE(review): presumably this node's position in its parent's children vector - confirm.
677  size_t index_within_parent;
678 
684  GumboParseFlags parse_flags;
685 
    // Type-specific payload; the trailing comments state which node type uses which member.
687  union {
688  GumboDocument document; // For GUMBO_NODE_DOCUMENT.
689  GumboElement element; // For GUMBO_NODE_ELEMENT.
690  GumboText text; // For everything else.
691  } v;
692 };
693 
// Function-pointer type for a custom allocator: called with an opaque `userdata` pointer and a
// size_t `size`, returns void*.
// NOTE(review): presumably follows malloc-like semantics (size in bytes, NULL on failure); the
// TODO below indicates callers do not yet check for out-of-memory.
700 // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
701 typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
702 
// Function-pointer type for a custom deallocator: called with an opaque `userdata` pointer and
// the pointer to release; returns nothing.
// NOTE(review): presumably `ptr` is always a value previously returned by the paired
// GumboAllocatorFunction - confirm.
707 typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
708 
// Parser options: allocator and deallocator callbacks, an opaque userdata pointer, a tab-stop
// width, and two error-handling settings.
// The page extraction dropped listing lines 717, 720, 738 and 747; they are restored from the
// member index at the end of this page, which records each member's type, name and listing line.
715 typedef struct GumboInternalOptions {
717  GumboAllocatorFunction allocator;
718 
720  GumboDeallocatorFunction deallocator;
721 
    // NOTE(review): presumably passed as the first argument to allocator/deallocator, whose
    // first parameter is also named `userdata` - confirm.
726  void* userdata;
727 
    // NOTE(review): presumably the column width of a tab when computing source positions - confirm.
732  int tab_stop;
733 
738  bool stop_on_first_error;
739 
    // NOTE(review): the meaning of zero or negative values (e.g. "unlimited") is not visible
    // here - confirm.
747  int max_errors;
748 } GumboOptions;
749 
752 
// Result of a parse: two node pointers and a vector of errors.
754 typedef struct GumboInternalOutput {
    // NOTE(review): presumably the document node at the top of the parse tree - confirm.
759  GumboNode* document;
760 
    // NOTE(review): presumably the root element within `document` - confirm.
765  GumboNode* root;
766 
    // Parse errors; the inline annotation gives the element type as GumboError, which is not
    // declared in this listing.
774  GumboVector /* GumboError */ errors;
775 } GumboOutput;
776 
// Declaration: takes a character buffer and returns a GumboOutput pointer.
// NOTE(review): no length or options are passed, so `buffer` is presumably NUL-terminated and
// default options are used; how the returned output must be released (presumably via
// gumbo_destroy_output) is not visible here - confirm.
784 GumboOutput* gumbo_parse(const char* buffer);
785 
// Declaration: takes an options struct, a character buffer and an explicit buffer length, and
// returns a GumboOutput pointer.
// Listing line 790, which carries the return type and function name, was dropped by the page
// extraction; it is restored from the member index at the end of this page.
// NOTE(review): presumably `buffer_length` is in bytes and the buffer need not be
// NUL-terminated - confirm.
790 GumboOutput* gumbo_parse_with_options(
791  const GumboOptions* options, const char* buffer, size_t buffer_length);
792 
// Declaration: takes an options struct and a GumboOutput pointer; returns nothing.
// Listing line 794, which carries the return type and function name, was dropped by the page
// extraction; it is restored from the member index at the end of this page.
// NOTE(review): presumably releases `output` using the deallocator in `options`, so the options
// should match those used for parsing - confirm.
794 void gumbo_destroy_output(
795  const GumboOptions* options, GumboOutput* output);
796 
797 
798 #ifdef __cplusplus
799 }
800 #endif
801 
802 #endif // GUMBO_GUMBO_H_
Definition: gumbo.h:754
GumboNamespaceEnum tag_namespace
Definition: gumbo.h:635
GumboSourcePosition value_start
Definition: gumbo.h:433
Definition: gumbo.h:715
Definition: gumbo.h:458
Definition: gumbo.h:573
Definition: gumbo.h:624
GumboSourcePosition start_pos
Definition: gumbo.h:653
GumboSourcePosition name_end
Definition: gumbo.h:430
const char * value
Definition: gumbo.h:410
GumboVector children
Definition: gumbo.h:629
Definition: gumbo.h:533
void(* GumboDeallocatorFunction)(void *userdata, void *ptr)
Definition: gumbo.h:707
GumboStringPiece original_value
Definition: gumbo.h:420
Definition: gumbo.h:566
GumboSourcePosition end_pos
Definition: gumbo.h:656
Definition: gumbo.h:547
GumboOutput * gumbo_parse(const char *buffer)
Definition: gumbo.h:600
unsigned int length
Definition: gumbo.h:126
GumboParseFlags parse_flags
Definition: gumbo.h:684
bool stop_on_first_error
Definition: gumbo.h:738
const GumboVector kGumboEmptyVector
GumboSourcePosition value_end
Definition: gumbo.h:436
GumboNodeType type
Definition: gumbo.h:671
int gumbo_vector_index_of(GumboVector *vector, void *element)
GumboTag
Definition: gumbo.h:154
void ** data
Definition: gumbo.h:123
GumboTag tag
Definition: gumbo.h:632
Definition: gumbo.h:119
GumboStringPiece original_tag
Definition: gumbo.h:643
Definition: gumbo.h:383
GumboAllocatorFunction allocator
Definition: gumbo.h:717
GumboStringPiece original_end_tag
Definition: gumbo.h:650
Definition: gumbo.h:456
const char * text
Definition: gumbo.h:605
Definition: gumbo.h:544
Definition: gumbo.h:62
GumboQuirksModeEnum doc_type_quirks_mode
Definition: gumbo.h:593
void *(* GumboAllocatorFunction)(void *userdata, size_t size)
Definition: gumbo.h:701
const GumboOptions kGumboDefaultOptions
void gumbo_destroy_output(const GumboOptions *options, GumboOutput *output)
Definition: gumbo.h:669
unsigned int capacity
Definition: gumbo.h:129
int max_errors
Definition: gumbo.h:747
GumboStringPiece original_name
Definition: gumbo.h:402
GumboNode * root
Definition: gumbo.h:765
size_t index_within_parent
Definition: gumbo.h:677
bool gumbo_string_equals_ignore_case(const GumboStringPiece *str1, const GumboStringPiece *str2)
void * userdata
Definition: gumbo.h:726
GumboStringPiece original_text
Definition: gumbo.h:611
const GumboStringPiece kGumboEmptyString
int tab_stop
Definition: gumbo.h:732
GumboParseFlags
Definition: gumbo.h:499
GumboOutput * gumbo_parse_with_options(const GumboOptions *options, const char *buffer, size_t buffer_length)
bool gumbo_string_equals(const GumboStringPiece *str1, const GumboStringPiece *str2)
const GumboSourcePosition kGumboEmptySourcePosition
GumboAttribute * gumbo_get_attribute(const GumboVector *attrs, const char *name)
GumboDeallocatorFunction deallocator
Definition: gumbo.h:720
Definition: gumbo.h:504
GumboNode * document
Definition: gumbo.h:759
GumboNodeType
Definition: gumbo.h:450
const char * data
Definition: gumbo.h:86
Definition: gumbo.h:452
Definition: gumbo.h:454
size_t length
Definition: gumbo.h:89
const char * gumbo_normalized_tagname(GumboTag tag)
const char * name
Definition: gumbo.h:396
union GumboInternalNode::@0 v
GumboVector children
Definition: gumbo.h:579
GumboAttributeNamespaceEnum attr_namespace
Definition: gumbo.h:390
GumboSourcePosition start_pos
Definition: gumbo.h:617
Definition: gumbo.h:462
Definition: gumbo.h:460
void gumbo_tag_from_original_text(GumboStringPiece *text)
Definition: gumbo.h:512
GumboNamespaceEnum
Definition: gumbo.h:485
const char * gumbo_normalize_svg_tagname(const GumboStringPiece *tagname)
Definition: gumbo.h:84
GumboSourcePosition name_start
Definition: gumbo.h:423
GumboVector attributes
Definition: gumbo.h:662
GumboQuirksModeEnum
Definition: gumbo.h:472
GumboVector errors
Definition: gumbo.h:774
Definition: gumbo.h:525
GumboTag gumbo_tag_enum(const char *tagname)
GumboAttributeNamespaceEnum
Definition: gumbo.h:371
GumboNode * parent
Definition: gumbo.h:674
Gumbo: gumbo.h Source File
Gumbo  1.0
A C library for parsing HTML.