Method from javax.swing.text.html.parser.Parser Detail: |
/**
 * Appends one character to the string buffer {@code str}.
 * When the buffer is full it is replaced by a copy that is 128 slots larger.
 *
 * @param c the character to append; it is narrowed to {@code char}
 */
void addString(int c) {
    final int capacity = str.length;
    if (strpos == capacity) {
        // Buffer exhausted: move the contents into a larger array.
        final char[] grown = new char[capacity + 128];
        System.arraycopy(str, 0, grown, 0, capacity);
        str = grown;
    }
    str[strpos++] = (char) c;
}
Add a char to the string buffer. |
/**
 * Handles an end tag and pops the current entry from the tag stack.
 * Pending text is flushed first; an error is reported when the end tag
 * was omitted although the element does not allow that, or when the
 * stack entry cannot be terminated.
 *
 * @param omitted {@code true} if the end tag did not appear in the input
 */
protected void endTag(boolean omitted) {
    handleText(stack.tag);

    if (!omitted || stack.elem.omitEnd()) {
        if (!stack.terminate()) {
            error("end.unexpected", stack.elem.getName());
        }
    } else {
        error("end.missing", stack.elem.getName());
    }

    // Notify the handler, then pop the stack.
    handleEndTag(stack.tag);
    stack = stack.next;
    if (stack != null) {
        recent = stack.elem;
    } else {
        recent = null;
    }
}
Handle an end tag. The end tag is popped
from the tag stack. |
/**
 * Reports an error that has no arguments; every argument slot is
 * filled with {@code "?"}.
 *
 * @param err the error key
 */
protected void error(String err) {
    final String placeholder = "?";
    error(err, placeholder, placeholder, placeholder);
}
|
protected void error(String err,
String arg1) {
error(err, arg1, "?", "?");
}
|
protected void error(String err,
String arg1,
String arg2) {
error(err, arg1, arg2, "?");
}
|
protected void error(String err,
String arg1,
String arg2,
String arg3) {
handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3);
}
Invoke the error handler. |
/**
 * Error context. Something went wrong, so make sure we are in the
 * document's body context: stack entries are ended until the body tag is
 * on top, and if the stack empties a body tag is opened.
 *
 * @throws ChangedCharSetException propagated from the calls that open the body
 */
void errorContext() throws ChangedCharSetException {
    // Unwind until the body element is on top of the stack.
    while (stack != null && stack.tag.getElement() != dtd.body) {
        handleEndTag(stack.tag);
        stack = stack.next;
    }
    if (stack != null) {
        return;
    }
    // Nothing left on the stack: open a fictional body.
    legalElementContext(dtd.body);
    startTag(makeTag(dtd.body, true));
}
Error context. Something went wrong, make sure we are in
the document's body context |
/**
 * Empties the current attribute set by asking {@code attributes} to
 * remove all of its own attributes.
 */
protected void flushAttributes() {
    attributes.removeAttributes(attributes);
}
|
/**
 * Returns the attribute set the parser is currently filling.
 *
 * @return the {@code attributes} field itself, not a copy
 */
protected SimpleAttributeSet getAttributes() {
    return attributes;
}
|
/**
 * Returns the start position of the current block. "Block" is overloaded
 * here: it means the start of the current comment, tag, text, and so on.
 * Provided for subclassers that want the start of the current block when
 * called from one of the handleXXX methods.
 *
 * @return {@code lastBlockStartPos - 1}, but never less than 0
 */
int getBlockStartPosition() {
    final int position = lastBlockStartPos - 1;
    return (position < 0) ? 0 : position;
}
Returns the start position of the current block. Block is
overloaded here, it really means the current start position for
the current comment tag, text, block.... This is provided for
subclassers that wish to know the start of the current block when
called with one of the handleXXX methods. |
/**
 * Copies the characters of the string buffer from {@code pos} up to the
 * current write position, then rewinds the write position to {@code pos}.
 *
 * @param pos index of the first character to return
 * @return a new array holding the characters removed from the buffer
 */
char[] getChars(int pos) {
    final int count = strpos - pos;
    final char[] copy = new char[count];
    System.arraycopy(str, pos, copy, 0, count);
    strpos = pos;
    return copy;
}
|
/**
 * Copies the characters of the string buffer in the range
 * {@code [pos, endPos)}. Unlike {@code getChars(int)} this does not move
 * the write position.
 *
 * @param pos    index of the first character to copy
 * @param endPos index just past the last character to copy
 * @return a new array holding the requested characters
 */
char[] getChars(int pos, int endPos) {
    final int count = endPos - pos;
    final char[] copy = new char[count];
    System.arraycopy(str, pos, copy, 0, count);
    // REMIND: it's not clear whether this version should reset strpos to
    // pos as well; for now it deliberately leaves strpos alone.
    return copy;
}
|
/**
 * Returns the current line counter.
 *
 * @return the value of {@code ln}
 */
protected int getCurrentLine() {
    return ln;
}
|
/**
 * Returns the current position counter.
 *
 * @return the value of {@code currentPosition}
 */
protected int getCurrentPos() {
    return currentPosition;
}
|
/**
 * Returns the end-of-line string, one of {@code "\r"}, {@code "\n"} or
 * {@code "\r\n"}, chosen by comparing the CR, LF and CRLF counters.
 * Ties are resolved in favour of {@code "\n"}.
 *
 * @return the end-of-line string selected from the counters
 */
String getEndOfLineString() {
    if (crlfCount >= crCount) {
        // CR alone is ruled out; decide between LF and CRLF.
        return (lfCount >= crlfCount) ? "\n" : "\r\n";
    }
    // CRLF is ruled out; decide between CR and LF.
    return (crCount > lfCount) ? "\r" : "\n";
}
Returns the end of line string. This will return the end of line
string that has been encountered the most, one of \r, \n or \r\n. |
/**
 * Gets the string that has been accumulated from {@code pos} onward and
 * rewinds the buffer's write position to {@code pos}.
 *
 * @param pos index of the first character of the result
 * @return the accumulated characters as a new {@code String}
 */
String getString(int pos) {
    // getChars(int) performs the copy and resets strpos.
    final char[] accumulated = getChars(pos);
    return new String(accumulated);
}
Get the string that's been accumulated. |
/**
 * Called when an HTML comment is encountered. The body here is empty.
 *
 * @param text the characters passed by the parser for the comment
 */
protected void handleComment(char[] text) {
}
Called when an HTML comment is encountered. |
protected void handleEOFInComment() {
// We've reached EOF. Our recovery strategy is to
// see if we have more than one line in the comment;
// if so, we pretend that the comment was an unterminated
// single line comment, and reparse the lines after the
// first line as normal HTML content.
int commentEndPos = strIndexOf('\n');
if (commentEndPos >= 0) {
handleComment(getChars(0, commentEndPos));
try {
in.close();
in = new CharArrayReader(getChars(commentEndPos + 1));
ch = ' >';
} catch (IOException e) {
error("ioexception");
}
resetStrBuffer();
} else {
// no newline, so signal an error
error("eof.comment");
}
}
|
/**
 * Called when an empty tag is encountered. The body here is empty.
 *
 * @param tag the tag passed by the parser
 * @throws ChangedCharSetException declared for implementations; never
 *         thrown by this empty body
 */
protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
}
Called when an empty tag is encountered. |
/**
 * Called when an end tag is encountered. The body here is empty.
 *
 * @param tag the tag being ended
 */
protected void handleEndTag(TagElement tag) {
}
Called when an end tag is encountered. |
/**
 * Receives every error reported through the {@code error} methods.
 * The body is empty apart from commented-out diagnostics.
 *
 * @param ln  the line number passed by the caller
 * @param msg the error text
 */
protected void handleError(int ln,
                           String msg) {
    /*
    Thread.dumpStack();
    System.out.println("**** " + stack);
    System.out.println("line " + ln + ": error: " + msg);
    System.out.println();
    */
}
|
/**
 * Called when a start tag is encountered. The body here is empty.
 *
 * @param tag the tag being started
 */
protected void handleStartTag(TagElement tag) {
}
Called when a start tag is encountered. |
/**
 * Called when PCDATA is encountered. The body here is empty.
 *
 * @param text the characters of the text run
 */
protected void handleText(char[] text) {
}
Called when PCDATA is encountered. |
/**
 * Flushes the text accumulated in {@code text} before {@code tag} is
 * processed. A pending space is emitted when appropriate, and the text is
 * delivered through {@code handleTitle} when the tag's element is named
 * "title", otherwise through {@code handleText(char[])}.
 *
 * @param tag the tag about to be handled
 */
void handleText(TagElement tag) {
    if (tag.breaksFlow()) {
        space = false;
        if (!strict) {
            ignoreSpace = true;
        }
    }

    // Nothing buffered: unless a lone pending space is legal PCDATA here,
    // just record the tag and leave.
    if (textpos == 0
            && (!space || stack == null || last.breaksFlow()
                || !stack.advance(dtd.pcdata))) {
        last = tag;
        space = false;
        lastBlockStartPos = currentBlockStartPos;
        return;
    }

    if (space && !ignoreSpace) {
        // enlarge buffer if needed
        if (textpos + 1 > text.length) {
            final char[] grown = new char[text.length + 200];
            System.arraycopy(text, 0, grown, 0, text.length);
            text = grown;
        }
        // output pending space
        text[textpos++] = ' ';
        if (!strict && !tag.getElement().isEmpty()) {
            ignoreSpace = true;
        }
    }
    space = false;

    final char[] chunk = new char[textpos];
    System.arraycopy(text, 0, chunk, 0, textpos);

    // Handles cases of bad html where the title tag
    // was getting lost when we did error recovery.
    final boolean isTitle = tag.getElement().getName().equals("title");
    if (isTitle) {
        handleTitle(chunk);
    } else {
        handleText(chunk);
    }

    // Reset the state for the next run of text.
    textpos = 0;
    lastBlockStartPos = currentBlockStartPos;
    last = tag;
    space = false;
}
|
/**
 * Called when an HTML title tag is encountered.
 *
 * @param text the title characters
 */
protected void handleTitle(char[] text) {
    // default behavior is to call handleText. Subclasses
    // can override if necessary.
    handleText(text);
}
Called when an HTML title tag is encountered. |
/**
 * Decides whether {@code elem} should be dropped in the current context.
 * <p>
 * Elements are ignored when they are a repeated html/head/body, a dt/dd
 * with no enclosing dl on the stack, anything but #pcdata or input
 * directly inside a table, a font directly inside ul/ol, a meta, or a
 * style once the body has been seen.
 *
 * @param elem the element under consideration
 * @return {@code true} if the element should be ignored
 */
boolean ignoreElement(Element elem) {
    final String top = stack.elem.getName();
    final String name = elem.getName();

    // A second html, head or body is never honoured.
    if ((name.equals("html") && seenHtml) ||
        (name.equals("head") && seenHead) ||
        (name.equals("body") && seenBody)) {
        return true;
    }

    // dt and dd only make sense somewhere inside a dl.
    if (name.equals("dt") || name.equals("dd")) {
        TagStack cursor;
        for (cursor = stack;
             cursor != null && !cursor.elem.getName().equals("dl");
             cursor = cursor.next) {
            // keep walking outward
        }
        if (cursor == null) {
            return true;
        }
    }

    // Inside a table only #pcdata and input get through here; td and th
    // are dealt with in legalElementContext().
    if (top.equals("table") && !name.equals("#pcdata") && !name.equals("input")) {
        return true;
    }
    // font directly inside a list.
    if (name.equals("font") && (top.equals("ul") || top.equals("ol"))) {
        return true;
    }
    if (name.equals("meta") && stack != null) {
        return true;
    }
    // style after the body tag has been seen.
    if (name.equals("style") && seenBody) {
        return true;
    }
    return top.equals("table") && name.equals("a");
}
|
/**
 * Creates a legal context for an element.
 * <p>
 * If {@code elem} is not allowed at the top of the tag stack, a series of
 * recovery strategies is tried in order: ignoring the element, inserting
 * a missing {@code tr}, closing enclosing tags, inserting the expected
 * next tag, inserting any tag that would legalize the context, and
 * finally ending the current tag. Most strategies recurse after changing
 * the stack.
 *
 * @param elem the element that is about to be started
 * @return {@code true} if the element is legal in the (possibly repaired)
 *         context, or if it is being ignored, in which case the
 *         {@code skipTag} field is also set; {@code false} if no
 *         strategy applied
 * @throws ChangedCharSetException propagated from starting inserted tags
 */
boolean legalElementContext(Element elem) throws ChangedCharSetException {
    // Deal with the empty stack
    if (stack == null) {
        if (elem != dtd.html) {
            // Push a fictional html tag and retry.
            startTag(makeTag(dtd.html, true));
            return legalElementContext(elem);
        }
        return true;
    }

    // Is it allowed in the current context
    if (stack.advance(elem)) {
        markFirstTime(elem);
        return true;
    }

    boolean insertTag = false;

    // The use of all error recovery strategies are contingent
    // on the value of the strict property.
    //
    // These are commonly occurring errors. If insertTag is true,
    // then we want to adopt an error recovery strategy that
    // involves attempting to insert an additional tag to
    // legalize the context. The two errors addressed here
    // are:
    // 1) when a <td> or <th> is seen soon after a <table> tag.
    //    In this case we insert a <tr>.
    // 2) when any other tag apart from a <tr> is seen
    //    in the context of a <tr>. In this case we would
    //    like to add a <td>. If a <tr> is seen within a
    //    <tr> context, then we will close out the current
    //    <tr>.
    //
    // This insertion strategy is handled later in the method.
    // The reason for checking this now, is that in other cases
    // we would like to apply other error recovery strategies for example
    // ignoring tags.
    //
    // In certain cases it is better to ignore a tag than try to
    // fix the situation. So the first test is to see if this
    // is what we need to do.
    //
    String stackElemName = stack.elem.getName();
    String elemName = elem.getName();

    if (!strict &&
        ((stackElemName.equals("table") && elemName.equals("td")) ||
         (stackElemName.equals("table") && elemName.equals("th")) ||
         (stackElemName.equals("tr") && !elemName.equals("tr")))) {
        insertTag = true;
    }

    // Compare the names by content: two distinct String instances holding
    // the same element name must count as the same element here.
    if (!strict && !insertTag && (!stackElemName.equals(elemName) ||
                                  elemName.equals("body"))) {
        if (skipTag = ignoreElement(elem)) {
            error("tag.ignore", elem.getName());
            return skipTag;
        }
    }

    // Check for anything after the start of the table besides tr, td, th
    // or caption, and if those aren't there, insert the <tr> and call
    // legalElementContext again.
    if (!strict && stackElemName.equals("table") &&
        !elemName.equals("tr") && !elemName.equals("td") &&
        !elemName.equals("th") && !elemName.equals("caption")) {
        Element e = dtd.getElement("tr");
        TagElement t = makeTag(e, true);
        legalTagContext(t);
        startTag(t);
        error("start.missing", elem.getName());
        return legalElementContext(elem);
    }

    // Try to find a legal context by checking if the current
    // tag is valid in an enclosing context. If so,
    // close out the tags by outputting end tags and then
    // insert the current tag. If the tags that are
    // being closed out do not have an optional end tag
    // specification in the DTD then an html error is
    // reported.
    //
    if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
        for (TagStack s = stack.next; s != null; s = s.next) {
            if (s.advance(elem)) {
                while (stack != s) {
                    endTag(true);
                }
                return true;
            }
            if (!s.terminate() || (strict && !s.elem.omitEnd())) {
                break;
            }
        }
    }

    // Check if we know what tag is expected next.
    // If so insert the tag. Report an error if the
    // tag does not have its start tag spec in the DTD as optional.
    //
    Element next = stack.first();
    if (next != null && (!strict || next.omitStart()) &&
        !(next == dtd.head && elem == dtd.pcdata)) {
        TagElement t = makeTag(next, true);
        legalTagContext(t);
        startTag(t);
        if (!next.omitStart()) {
            error("start.missing", elem.getName());
        }
        return legalElementContext(elem);
    }

    // Traverse the list of expected elements and determine if adding
    // any of these elements would make for a legal context.
    //
    if (!strict) {
        ContentModel content = stack.contentModel();
        Vector<Element> elemVec = new Vector<Element>();
        if (content != null) {
            content.getElements(elemVec);
            for (Element e : elemVec) {
                // Ensure that this element has not been included as
                // part of the exclusions in the DTD.
                //
                if (stack.excluded(e.getIndex())) {
                    continue;
                }

                boolean reqAtts = false;
                for (AttributeList a = e.getAttributes(); a != null; a = a.next) {
                    if (a.modifier == REQUIRED) {
                        reqAtts = true;
                        break;
                    }
                }
                // Ensure that no tag that has required attributes
                // gets inserted.
                //
                if (reqAtts) {
                    continue;
                }

                ContentModel m = e.getContent();
                if (m != null && m.first(elem)) {
                    TagElement t = makeTag(e, true);
                    legalTagContext(t);
                    startTag(t);
                    error("start.missing", e.getName());
                    return legalElementContext(elem);
                }
            }
        }
    }

    // Check if the stack can be terminated. If so add the appropriate
    // end tag. Report an error if the tag being ended does not have its
    // end tag spec in the DTD as optional.
    //
    if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
        if (!stack.elem.omitEnd()) {
            error("end.missing", elem.getName());
        }

        endTag(true);
        return legalElementContext(elem);
    }

    // At this point we know that something is screwed up.
    return false;
}
Create a legal context for an element. |
/**
 * Creates a legal context for a tag. When the tag's element cannot be
 * made legal directly, the current tag is ended if that avoids nesting a
 * flow-breaking tag inside a non-flow-breaking one, or the document head
 * is closed if it is on the stack. If nothing helps, a
 * {@code "tag.unexpected"} error is reported.
 *
 * @param tag the tag that is about to be started
 * @throws ChangedCharSetException propagated from {@code legalElementContext}
 */
void legalTagContext(TagElement tag) throws ChangedCharSetException {
    if (legalElementContext(tag.getElement())) {
        markFirstTime(tag.getElement());
        return;
    }

    // Avoid putting a block tag in a flow tag.
    if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
        endTag(true);
        legalTagContext(tag);
        return;
    }

    // Avoid putting something weird in the head of the document.
    TagStack cursor = stack;
    while (cursor != null) {
        if (cursor.tag.getElement() == dtd.head) {
            // Close everything above the head, then the head itself.
            while (stack != cursor) {
                endTag(true);
            }
            endTag(true);
            legalTagContext(tag);
            return;
        }
        cursor = cursor.next;
    }

    // Everything failed
    error("tag.unexpected", tag.getElement().getName());
}
Create a legal context for a tag. |
/**
 * Makes a non-fictional tag for {@code elem}; equivalent to
 * {@code makeTag(elem, false)}.
 *
 * @param elem the element the tag is for
 * @return the tag produced by the two-argument {@code makeTag}
 */
protected TagElement makeTag(Element elem) {
    return makeTag(elem, false);
}
|
/**
 * Makes a new {@code TagElement} for {@code elem}.
 *
 * @param elem      the element the tag is for
 * @param fictional passed through to the {@code TagElement} constructor;
 *                  callers in this class pass {@code true} for tags they
 *                  insert themselves
 * @return the newly created tag
 */
protected TagElement makeTag(Element elem,
                             boolean fictional) {
    return new TagElement(elem, fictional);
}
|
/**
 * Marks the first time an html, head or body tag has been seen in a
 * document by setting the corresponding flag. On body, a one-element
 * {@code buf} is also replaced by a 256-element array that keeps its
 * first character.
 *
 * @param elem the element that has just been accepted
 */
protected void markFirstTime(Element elem) {
    final String name = elem.getName();
    if (name.equals("html")) {
        seenHtml = true;
        return;
    }
    if (name.equals("head")) {
        seenHead = true;
        return;
    }
    if (!name.equals("body")) {
        return;
    }
    if (buf.length == 1) {
        // Refer to note in definition of buf for details on this.
        final char[] enlarged = new char[256];
        enlarged[0] = buf[0];
        buf = enlarged;
    }
    seenBody = true;
}
Marks the first time a tag has been seen in a document |
/**
 * Parses an HTML stream, given a DTD.
 * <p>
 * Parser state is reset, the content is parsed, and any tags still open
 * afterwards are ended. The reader is closed only when parsing completes
 * without an exception. The working buffers are released in all cases.
 *
 * @param in the source of the HTML text
 * @throws IOException if reading fails; the error is reported through
 *         {@code error} before being rethrown
 */
public synchronized void parse(Reader in) throws IOException {
    this.in = in;
    this.ln = 1;

    seenHtml = false;
    seenHead = false;
    seenBody = false;

    crlfCount = 0;
    lfCount = 0;
    crCount = 0;

    try {
        ch = readCh();
        text = new char[1024];
        str = new char[128];

        parseContent();
        // NOTE: interruption may have occurred. Control flows out
        // of here normally.
        while (stack != null) {
            endTag(true);
        }
        in.close();
    } catch (IOException e) {
        errorContext();
        error("ioexception");
        throw e;
    } catch (Exception e) {
        errorContext();
        error("exception", e.getClass().getName(), e.getMessage());
        e.printStackTrace();
    } catch (ThreadDeath e) {
        errorContext();
        error("terminated");
        e.printStackTrace();
        throw e;
    } finally {
        // Report end tags for whatever is still on the stack.
        while (stack != null) {
            handleEndTag(stack.tag);
            stack = stack.next;
        }

        text = null;
        str = null;
    }
}
Parse an HTML stream, given a DTD. |
void parseAttributeSpecificationList(Element elem) throws IOException {
while (true) {
skipSpace();
switch (ch) {
case '/':
case ' >':
case '< ':
case -1:
return;
case '-':
if ((ch = readCh()) == '-') {
ch = readCh();
parseComment();
strpos = 0;
} else {
error("invalid.tagchar", "-", elem.getName());
ch = readCh();
}
continue;
}
AttributeList att;
String attname;
String attvalue;
if (parseIdentifier(true)) {
attname = getString(0);
skipSpace();
if (ch == '=') {
ch = readCh();
skipSpace();
att = elem.getAttribute(attname);
// Bug ID 4102750
// Load the NAME of an Attribute Case Sensitive
// The case of the NAME must be intact
// MG 021898
attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
// attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION));
} else {
attvalue = attname;
att = elem.getAttributeByValue(attvalue);
if (att == null) {
att = elem.getAttribute(attname);
if (att != null) {
attvalue = att.getValue();
}
else {
// Make it null so that NULL_ATTRIBUTE_VALUE is
// used
attvalue = null;
}
}
}
} else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
ch = readCh();
continue;
} else if (!strict && ch == '"') { // allows for quoted attributes
ch = readCh();
skipSpace();
if (parseIdentifier(true)) {
attname = getString(0);
if (ch == '"') {
ch = readCh();
}
skipSpace();
if (ch == '=') {
ch = readCh();
skipSpace();
att = elem.getAttribute(attname);
attvalue = parseAttributeValue((att != null) &&
(att.type != CDATA) &&
(att.type != NOTATION));
} else {
attvalue = attname;
att = elem.getAttributeByValue(attvalue);
if (att == null) {
att = elem.getAttribute(attname);
if (att != null) {
attvalue = att.getValue();
}
}
}
} else {
char str[] = {(char)ch};
error("invalid.tagchar", new String(str), elem.getName());
ch = readCh();
continue;
}
} else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
ch = readCh();
skipSpace();
attname = elem.getName();
att = elem.getAttribute(attname);
attvalue = parseAttributeValue((att != null) &&
(att.type != CDATA) &&
(att.type != NOTATION));
} else if (!strict && (ch == '=')) {
ch = readCh();
skipSpace();
attvalue = parseAttributeValue(true);
error("attvalerr");
return;
} else {
char str[] = {(char)ch};
error("invalid.tagchar", new String(str), elem.getName());
if (!strict) {
ch = readCh();
continue;
} else {
return;
}
}
if (att != null) {
attname = att.getName();
} else {
error("invalid.tagatt", attname, elem.getName());
}
// Check out the value
if (attributes.isDefined(attname)) {
error("multi.tagatt", attname, elem.getName());
}
if (attvalue == null) {
attvalue = ((att != null) && (att.value != null)) ? att.value :
HTML.NULL_ATTRIBUTE_VALUE;
} else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
error("invalid.tagattval", attname, elem.getName());
}
HTML.Attribute attkey = HTML.getAttributeKey(attname);
if (attkey == null) {
attributes.addAttribute(attname, attvalue);
} else {
attributes.addAttribute(attkey, attvalue);
}
}
}
Parse attribute specification List. [31] 327:17 |
String parseAttributeValue(boolean lower) throws IOException {
int delim = -1;
// Check for a delimiter
switch(ch) {
case '\'':
case '"':
delim = ch;
ch = readCh();
break;
}
// Parse the rest of the value
while (true) {
int c = ch;
switch (c) {
case '\n':
ln++;
ch = readCh();
lfCount++;
if (delim < 0) {
return getString(0);
}
break;
case '\r':
ln++;
if ((ch = readCh()) == '\n') {
ch = readCh();
crlfCount++;
}
else {
crCount++;
}
if (delim < 0) {
return getString(0);
}
break;
case '\t':
if (delim < 0)
c = ' ';
case ' ':
ch = readCh();
if (delim < 0) {
return getString(0);
}
break;
case ' >':
case '< ':
if (delim < 0) {
return getString(0);
}
ch = readCh();
break;
case '\'':
case '"':
ch = readCh();
if (c == delim) {
return getString(0);
} else if (delim == -1) {
error("attvalerr");
if (strict || ch == ' ') {
return getString(0);
} else {
continue;
}
}
break;
case '=':
if (delim < 0) {
/* In SGML a construct like < img src=/cgi-bin/foo?x=1 >
is considered invalid since an = sign can only be contained
in an attributes value if the string is quoted.
*/
error("attvalerr");
/* If strict is true then we return with the string we have thus far.
Otherwise we accept the = sign as part of the attribute's value and
process the rest of the img tag. */
if (strict) {
return getString(0);
}
}
ch = readCh();
break;
case '&':
if (strict && delim < 0) {
ch = readCh();
break;
}
char data[] = parseEntityReference();
for (int i = 0 ; i < data.length ; i++) {
c = data[i];
addString((lower && (c >= 'A') && (c < = 'Z')) ? 'a' + c - 'A' : c);
}
continue;
case -1:
return getString(0);
default:
if (lower && (c >= 'A') && (c < = 'Z')) {
c = 'a' + c - 'A';
}
ch = readCh();
break;
}
addString(c);
}
}
Parse attribute value. [33] 331:1 |
void parseComment() throws IOException {
while (true) {
int c = ch;
switch (c) {
case '-':
/** Presuming that the start string of a comment "< !--" has
already been parsed, the '-' character is valid only as
part of a comment termination and further more it must
be present in even numbers. Hence if strict is true, we
presume the comment has been terminated and return.
However if strict is false, then there is no even number
requirement and this character can appear anywhere in the
comment. The parser reads on until it sees the following
pattern: "-- >" or "--! >".
**/
if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
if ((ch = readCh()) == ' >') {
return;
}
if (ch == '!') {
if ((ch = readCh()) == ' >') {
return;
} else {
/* to account for extra read()'s that happened */
addString('-');
addString('!');
continue;
}
}
break;
}
if ((ch = readCh()) == '-') {
ch = readCh();
if (strict || ch == ' >') {
return;
}
if (ch == '!') {
if ((ch = readCh()) == ' >') {
return;
} else {
/* to account for extra read()'s that happened */
addString('-');
addString('!');
continue;
}
}
/* to account for the extra read() */
addString('-');
}
break;
case -1:
handleEOFInComment();
return;
case '\n':
ln++;
ch = readCh();
lfCount++;
break;
case ' >':
ch = readCh();
break;
case '\r':
ln++;
if ((ch = readCh()) == '\n') {
ch = readCh();
crlfCount++;
}
else {
crCount++;
}
c = '\n';
break;
default:
ch = readCh();
break;
}
addString(c);
}
}
Parse a comment. [92] 391:7 |
void parseContent() throws IOException {
Thread curThread = Thread.currentThread();
for (;;) {
if (curThread.isInterrupted()) {
curThread.interrupt(); // resignal the interrupt
break;
}
int c = ch;
currentBlockStartPos = currentPosition;
if (recent == dtd.script) { // means: if after starting < script > tag
/* Here, ch has to be the first character after < script > */
parseScript();
last = makeTag(dtd.getElement("comment"), true);
/* Remove leading and trailing HTML comment declarations */
String str = new String(getChars(0)).trim();
int minLength = START_COMMENT.length() + END_COMMENT.length();
if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT)
&& str.length() >= (minLength)) {
str = str.substring(START_COMMENT.length(),
str.length() - END_COMMENT.length());
}
/* Handle resulting chars as comment */
handleComment(str.toCharArray());
endTag(false);
lastBlockStartPos = currentPosition;
} else {
switch (c) {
case '< ':
parseTag();
lastBlockStartPos = currentPosition;
continue;
case '/':
ch = readCh();
if ((stack != null) && stack.net) {
// null end tag.
endTag(false);
continue;
}
break;
case -1:
return;
case '&':
if (textpos == 0) {
if (!legalElementContext(dtd.pcdata)) {
error("unexpected.pcdata");
}
if (last.breaksFlow()) {
space = false;
}
}
char data[] = parseEntityReference();
if (textpos + data.length + 1 > text.length) {
char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
System.arraycopy(text, 0, newtext, 0, text.length);
text = newtext;
}
if (space) {
space = false;
text[textpos++] = ' ';
}
System.arraycopy(data, 0, text, textpos, data.length);
textpos += data.length;
ignoreSpace = false;
continue;
case '\n':
ln++;
lfCount++;
ch = readCh();
if ((stack != null) && stack.pre) {
break;
}
if (textpos == 0) {
lastBlockStartPos = currentPosition;
}
if (!ignoreSpace) {
space = true;
}
continue;
case '\r':
ln++;
c = '\n';
if ((ch = readCh()) == '\n') {
ch = readCh();
crlfCount++;
}
else {
crCount++;
}
if ((stack != null) && stack.pre) {
break;
}
if (textpos == 0) {
lastBlockStartPos = currentPosition;
}
if (!ignoreSpace) {
space = true;
}
continue;
case '\t':
case ' ':
ch = readCh();
if ((stack != null) && stack.pre) {
break;
}
if (textpos == 0) {
lastBlockStartPos = currentPosition;
}
if (!ignoreSpace) {
space = true;
}
continue;
default:
if (textpos == 0) {
if (!legalElementContext(dtd.pcdata)) {
error("unexpected.pcdata");
}
if (last.breaksFlow()) {
space = false;
}
}
ch = readCh();
break;
}
}
// enlarge buffer if needed
if (textpos + 2 > text.length) {
char newtext[] = new char[text.length + 128];
System.arraycopy(text, 0, newtext, 0, text.length);
text = newtext;
}
// output pending space
if (space) {
if (textpos == 0) {
lastBlockStartPos--;
}
text[textpos++] = ' ';
space = false;
}
text[textpos++] = (char)c;
ignoreSpace = false;
}
}
Parse Content. [24] 320:1 |
public String parseDTDMarkup() throws IOException {
StringBuilder strBuff = new StringBuilder();
ch = readCh();
while(true) {
switch (ch) {
case ' >':
ch = readCh();
return strBuff.toString();
case -1:
error("invalid.markup");
return strBuff.toString();
case '\n':
ln++;
ch = readCh();
lfCount++;
break;
case '"':
ch = readCh();
break;
case '\r':
ln++;
if ((ch = readCh()) == '\n') {
ch = readCh();
crlfCount++;
}
else {
crCount++;
}
break;
default:
strBuff.append((char)(ch & 0xFF));
ch = readCh();
break;
}
}
}
Parses the Document Type Declaration markup declaration.
Currently ignores it. |
/**
 * Parses an identifier into the string buffer. [55] 346:17
 * <p>
 * An identifier starts with an ASCII letter and continues with ASCII
 * letters, digits, {@code '.'}, {@code '-'} or {@code '_'}. On return
 * {@code ch} holds the first character after the identifier.
 *
 * @param lower if {@code true}, uppercase letters are folded to lowercase
 * @return {@code false} if the current character cannot start an
 *         identifier, in which case nothing is consumed; {@code true}
 *         otherwise
 * @throws IOException if reading the input fails
 */
boolean parseIdentifier(boolean lower) throws IOException {
    // The first character must be a letter.
    if (ch >= 'A' && ch <= 'Z') {
        if (lower) {
            ch = 'a' + (ch - 'A');
        }
    } else if (!(ch >= 'a' && ch <= 'z')) {
        return false;
    }

    for (;;) {
        addString(ch);
        ch = readCh();

        if (ch >= 'A' && ch <= 'Z') {
            if (lower) {
                ch = 'a' + (ch - 'A');
            }
            continue;
        }

        // '_' is not officially allowed but is accepted.
        final boolean nameChar =
            (ch >= 'a' && ch <= 'z') ||
            (ch >= '0' && ch <= '9') ||
            ch == '.' || ch == '-' || ch == '_';
        if (!nameChar) {
            return true;
        }
    }
}
Parse identifier. Uppercase characters are folded
to lowercase when lower is true. Returns false if
no identifier is found. [55] 346:17 |
void parseInvalidTag() throws IOException {
// ignore all data upto the close bracket ' >'
while (true) {
skipSpace();
switch (ch) {
case ' >':
case -1:
ch = readCh();
return;
case '< ':
return;
default:
ch = readCh();
}
}
}
|
void parseLiteral(boolean replace) throws IOException {
while (true) {
int c = ch;
switch (c) {
case -1:
error("eof.literal", stack.elem.getName());
endTag(true);
return;
case ' >':
ch = readCh();
int i = textpos - (stack.elem.name.length() + 2), j = 0;
// match end tag
if ((i >= 0) && (text[i++] == '< ') && (text[i] == '/')) {
while ((++i < textpos) &&
(Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
if (i == textpos) {
textpos -= (stack.elem.name.length() + 2);
if ((textpos > 0) && (text[textpos-1] == '\n')) {
textpos--;
}
endTag(false);
return;
}
}
break;
case '&':
char data[] = parseEntityReference();
if (textpos + data.length > text.length) {
char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
System.arraycopy(text, 0, newtext, 0, text.length);
text = newtext;
}
System.arraycopy(data, 0, text, textpos, data.length);
textpos += data.length;
continue;
case '\n':
ln++;
ch = readCh();
lfCount++;
break;
case '\r':
ln++;
if ((ch = readCh()) == '\n') {
ch = readCh();
crlfCount++;
}
else {
crCount++;
}
c = '\n';
break;
default:
ch = readCh();
break;
}
// output character
if (textpos == text.length) {
char newtext[] = new char[text.length + 128];
System.arraycopy(text, 0, newtext, 0, text.length);
text = newtext;
}
text[textpos++] = (char)c;
}
}
Parse literal content. [46] 343:1 and [47] 344:1 |
/**
 * Parses markup declarations. Currently only the Document Type
 * Declaration (DOCTYPE) is handled.
 *
 * @param strBuff the markup text collected so far
 * @return {@code true} if {@code strBuff} is exactly "DOCTYPE" in any
 *         letter case, in which case the declaration has been consumed
 *         by {@code parseDTDMarkup}; {@code false} otherwise
 * @throws IOException if reading the input fails
 */
protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException {
    /* Currently handles only the DOCTYPE */
    final String doctype = "DOCTYPE";
    // The length test skips building a String on most calls.
    // equalsIgnoreCase does not depend on the default locale, unlike
    // toUpperCase().
    if ((strBuff.length() == doctype.length()) &&
        doctype.equalsIgnoreCase(strBuff.toString())) {
        parseDTDMarkup();
        return true;
    }
    return false;
}
Parse markup declarations.
Currently only handles the Document Type Declaration markup.
Returns true if it is a markup declaration false otherwise. |
/**
 * Reads the body of a script element into the string buffer, stopping
 * once the characters of {@code SCRIPT_END_TAG} (or its uppercase form)
 * have been matched. On entry {@code ch} should be the first character
 * after the opening script tag. Line ends are stored as {@code '\n'}.
 * End of input reports {@code "eof.script"}.
 *
 * @throws IOException if reading the input fails
 */
void parseScript() throws IOException {
    final char[] pending = new char[SCRIPT_END_TAG.length];

    /* Here, ch should be the first character after <script> */
    for (;;) {
        // Try to match the end tag, remembering what was consumed.
        int matched = 0;
        while (matched < SCRIPT_END_TAG.length
               && (SCRIPT_END_TAG[matched] == ch
                   || SCRIPT_END_TAG_UPPER_CASE[matched] == ch)) {
            pending[matched] = (char) ch;
            ch = readCh();
            matched++;
        }

        if (matched == SCRIPT_END_TAG.length) {
            /* '</script>' tag detected */
            /* Here, ch == '>' */
            ch = readCh();
            /* Here, ch == the first character after </script> */
            return;
        }

        /* To account for extra read()'s that happened */
        for (int k = 0; k < matched; k++) {
            addString(pending[k]);
        }

        if (ch == -1) {
            error("eof.script");
            return;
        }

        if (ch == '\n') {
            ln++;
            ch = readCh();
            lfCount++;
            addString('\n');
        } else if (ch == '\r') {
            ln++;
            ch = readCh();
            if (ch == '\n') {
                ch = readCh();
                crlfCount++;
            } else {
                crCount++;
            }
            addString('\n');
        } else {
            addString(ch);
            ch = readCh();
        }
    }
}
|
void parseTag() throws IOException {
Element elem;
boolean net = false;
boolean warned = false;
boolean unknown = false;
switch (ch = readCh()) {
case '!':
switch (ch = readCh()) {
case '-':
// Parse comment. [92] 391:7
while (true) {
if (ch == '-') {
if (!strict || ((ch = readCh()) == '-')) {
ch = readCh();
if (!strict && ch == '-') {
ch = readCh();
}
// send over any text you might see
// before parsing and sending the
// comment
if (textpos != 0) {
char newtext[] = new char[textpos];
System.arraycopy(text, 0, newtext, 0, textpos);
handleText(newtext);
lastBlockStartPos = currentBlockStartPos;
textpos = 0;
}
parseComment();
last = makeTag(dtd.getElement("comment"), true);
handleComment(getChars(0));
continue;
} else if (!warned) {
warned = true;
error("invalid.commentchar", "-");
}
}
skipSpace();
switch (ch) {
case '-':
continue;
case ' >':
ch = readCh();
case -1:
return;
default:
ch = readCh();
if (!warned) {
warned = true;
error("invalid.commentchar",
String.valueOf((char)ch));
}
break;
}
}
default:
// deal with marked sections
StringBuffer strBuff = new StringBuffer();
while (true) {
strBuff.append((char)ch);
if (parseMarkupDeclarations(strBuff)) {
return;
}
switch(ch) {
case ' >':
ch = readCh();
case -1:
error("invalid.markup");
return;
case '\n':
ln++;
ch = readCh();
lfCount++;
break;
case '\r':
ln++;
if ((ch = readCh()) == '\n') {
ch = readCh();
crlfCount++;
}
else {
crCount++;
}
break;
default:
ch = readCh();
break;
}
}
}
case '/':
// parse end tag [19] 317:4
switch (ch = readCh()) {
case ' >':
ch = readCh();
case '< ':
// empty end tag. either < / > or < /<
if (recent == null) {
error("invalid.shortend");
return;
}
elem = recent;
break;
default:
if (!parseIdentifier(true)) {
error("expected.endtagname");
return;
}
skipSpace();
switch (ch) {
case ' >':
ch = readCh();
case '< ':
break;
default:
error("expected", "' >'");
while ((ch != -1) && (ch != '\n') && (ch != ' >')) {
ch = readCh();
}
if (ch == ' >') {
ch = readCh();
}
break;
}
String elemStr = getString(0);
if (!dtd.elementExists(elemStr)) {
error("end.unrecognized", elemStr);
// Ignore RE before end tag
if ((textpos > 0) && (text[textpos-1] == '\n')) {
textpos--;
}
elem = dtd.getElement("unknown");
elem.name = elemStr;
unknown = true;
} else {
elem = dtd.getElement(elemStr);
}
break;
}
// If the stack is null, we're seeing end tags without any begin
// tags. Ignore them.
if (stack == null) {
error("end.extra.tag", elem.getName());
return;
}
// Ignore RE before end tag
if ((textpos > 0) && (text[textpos-1] == '\n')) {
// In a pre tag, if there are blank lines
// we do not want to remove the newline
// before the end tag. Hence this code.
//
if (stack.pre) {
if ((textpos > 1) && (text[textpos-2] != '\n')) {
textpos--;
}
} else {
textpos--;
}
}
// If the end tag is a form, since we did not put it
// on the tag stack, there is no corresponding start
// start tag to find. Hence do not touch the tag stack.
//
/*
if (!strict && elem.getName().equals("form")) {
if (lastFormSent != null) {
handleEndTag(lastFormSent);
return;
} else {
// do nothing.
return;
}
}
*/
if (unknown) {
// we will not see a corresponding start tag
// on the the stack. If we are seeing an
// end tag, lets send this on as an empty
// tag with the end tag attribute set to
// true.
TagElement t = makeTag(elem);
handleText(t);
attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
handleEmptyTag(makeTag(elem));
unknown = false;
return;
}
// find the corresponding start tag
// A commonly occuring error appears to be the insertion
// of extra end tags in a table. The intent here is ignore
// such extra end tags.
//
if (!strict) {
String stackElem = stack.elem.getName();
if (stackElem.equals("table")) {
// If it isnt a valid end tag ignore it and return
//
if (!elem.getName().equals(stackElem)) {
error("tag.ignore", elem.getName());
return;
}
}
if (stackElem.equals("tr") ||
stackElem.equals("td")) {
if ((!elem.getName().equals("table")) &&
(!elem.getName().equals(stackElem))) {
error("tag.ignore", elem.getName());
return;
}
}
}
TagStack sp = stack;
while ((sp != null) && (elem != sp.elem)) {
sp = sp.next;
}
if (sp == null) {
error("unmatched.endtag", elem.getName());
return;
}
// People put font ending tags in the darndest places.
// Don't close other contexts based on them being between
// a font tag and the corresponding end tag. Instead,
// ignore the end tag like it doesn't exist and allow the end
// of the document to close us out.
String elemName = elem.getName();
if (stack != sp &&
(elemName.equals("font") ||
elemName.equals("center"))) {
// Since closing out a center tag can have real wierd
// effects on the formatting, make sure that tags
// for which omitting an end tag is legimitate
// get closed out.
//
if (elemName.equals("center")) {
while(stack.elem.omitEnd() && stack != sp) {
endTag(true);
}
if (stack.elem == elem) {
endTag(false);
}
}
return;
}
// People do the same thing with center tags. In this
// case we would like to close off the center tag but
// not necessarily all enclosing tags.
// end tags
while (stack != sp) {
endTag(true);
}
endTag(false);
return;
case -1:
error("eof");
return;
}
// start tag [14] 314:1
if (!parseIdentifier(true)) {
elem = recent;
if ((ch != ' >') || (elem == null)) {
error("expected.tagname");
return;
}
} else {
String elemStr = getString(0);
if (elemStr.equals("image")) {
elemStr = "img";
}
/* determine if this element is part of the dtd. */
if (!dtd.elementExists(elemStr)) {
// parseInvalidTag();
error("tag.unrecognized ", elemStr);
elem = dtd.getElement("unknown");
elem.name = elemStr;
unknown = true;
} else {
elem = dtd.getElement(elemStr);
}
}
// Parse attributes
parseAttributeSpecificationList(elem);
switch (ch) {
case '/':
net = true;
case ' >':
ch = readCh();
if (ch == ' >' && net) {
ch = readCh();
}
case '< ':
break;
default:
error("expected", "' >'");
break;
}
if (!strict) {
if (elem.getName().equals("script")) {
error("javascript.unsupported");
}
}
// ignore RE after start tag
//
if (!elem.isEmpty()) {
if (ch == '\n') {
ln++;
lfCount++;
ch = readCh();
} else if (ch == '\r') {
ln++;
if ((ch = readCh()) == '\n') {
ch = readCh();
crlfCount++;
}
else {
crCount++;
}
}
}
// ensure a legal context for the tag
TagElement tag = makeTag(elem, false);
/** In dealing with forms, we have decided to treat
them as legal in any context. Also, even though
they do have a start and an end tag, we will
not put this tag on the stack. This is to deal
several pages in the web oasis that choose to
start and end forms in any possible location. **/
/*
if (!strict && elem.getName().equals("form")) {
if (lastFormSent == null) {
lastFormSent = tag;
} else {
handleEndTag(lastFormSent);
lastFormSent = tag;
}
} else {
*/
// Similarly, if a tag is unknown, we will apply
// no legalTagContext logic to it.
//
if (!unknown) {
legalTagContext(tag);
// If skip tag is true, this implies that
// the tag was illegal and that the error
// recovery strategy adopted is to ignore
// the tag.
if (!strict && skipTag) {
skipTag = false;
return;
}
}
/*
}
*/
startTag(tag);
if (!elem.isEmpty()) {
switch (elem.getType()) {
case CDATA:
parseLiteral(false);
break;
case RCDATA:
parseLiteral(true);
break;
default:
if (stack != null) {
stack.net = net;
}
break;
}
}
}
Parse a start or end tag. |
/**
 * Resets the string buffer by moving its write position back to the start.
 * Only {@code strpos} is changed; the backing array is left as it is.
 */
void resetStrBuffer() {
    this.strpos = 0;
}
|
/**
 * Skips spaces, tabs and line terminators, starting at the current
 * character {@code ch}. On return {@code ch} holds the first character
 * that is none of these.
 *
 * <p>Every line terminator bumps the line number {@code ln} and exactly one
 * of the counters {@code lfCount}, {@code crlfCount} or {@code crCount},
 * depending on whether it was LF, CR LF or a lone CR.
 *
 * @throws IOException propagated from {@code readCh()}
 */
void skipSpace() throws IOException {
    for (;;) {
        if (ch == '\n') {
            // Bare line feed.
            ln++;
            ch = readCh();
            lfCount++;
        } else if (ch == '\r') {
            // Carriage return: look one character ahead to tell CR LF
            // apart from a lone CR.
            ln++;
            ch = readCh();
            if (ch == '\n') {
                ch = readCh();
                crlfCount++;
            } else {
                crCount++;
            }
        } else if (ch == ' ' || ch == '\t') {
            ch = readCh();
        } else {
            // Anything else ends the run of whitespace.
            return;
        }
    }
}
|
/**
 * Handles a start tag. Pending text is flushed where needed, the element's
 * REQUIRED attributes are checked against the current attribute set, and
 * the tag is then reported: empty elements go to {@code handleEmptyTag},
 * all others are pushed onto the tag stack and go to {@code handleStartTag}.
 *
 * @param tag the start tag to process
 * @throws ChangedCharSetException declared by the signature; not thrown
 *         directly by this method's own statements
 */
protected void startTag(TagElement tag) throws ChangedCharSetException {
    final Element element = tag.getElement();

    // handleText() is skipped only when all three hold: the element is
    // empty, `last` is null or reports breaksFlow(), and no text is
    // buffered (textpos == 0). Otherwise text before the start tag has to
    // be processed before the tag itself.
    if (element.isEmpty()
            && ((last == null) || last.breaksFlow())
            && (textpos == 0)) {
        // `last` is normally updated inside handleText(); since that call
        // is skipped on this path, it has to be updated here instead.
        last = tag;
        // Note that we should really check last.breakFlows before
        // assuming this should be false.
        space = false;
    } else {
        handleText(tag);
    }
    lastBlockStartPos = currentBlockStartPos;

    // Walk the element's attribute list and report every REQUIRED entry
    // that the current attribute set does not define, either under its
    // plain name or under the key HTML.getAttributeKey() returns for it.
    AttributeList declared = element.atts;
    while (declared != null) {
        if (declared.modifier == REQUIRED) {
            boolean missing = attributes.isEmpty()
                    || (!attributes.isDefined(declared.name)
                        && !attributes.isDefined(HTML.getAttributeKey(declared.name)));
            if (missing) {
                error("req.att ", declared.getName(), element.getName());
            }
        }
        declared = declared.next;
    }

    // A disabled special case once sent "form" elements straight to
    // handleStartTag() without pushing them on the stack; it is not active.
    if (!element.isEmpty()) {
        recent = element;
        stack = new TagStack(tag, stack);
        handleStartTag(tag);
        return;
    }
    handleEmptyTag(tag);
}
Handle a start tag. The new tag is pushed
onto the tag stack. The attribute list is
checked for required attributes. |
/**
 * Finds the first occurrence of a character in the filled part of the
 * string buffer, i.e. among {@code str[0]} .. {@code str[strpos - 1]}.
 *
 * @param target the character to look for
 * @return the index of the first match, or {@code -1} if there is none
 */
int strIndexOf(char target) {
    int index = 0;
    while (index < strpos) {
        if (target == str[index]) {
            return index;
        }
        index++;
    }
    return -1;
}
|