Viewing file: CommentLexer.h (10.03 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  This file defines the lexer for structured comments and the supporting
//  token class.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_AST_COMMENTLEXER_H #define LLVM_CLANG_AST_COMMENTLEXER_H
#include "clang/Basic/Diagnostic.h" #include "clang/Basic/SourceManager.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/raw_ostream.h"
namespace clang { namespace comments {
// Forward declarations for types that this header names before (or without)
// seeing their definitions.
class Lexer;                // Defined later in this header; friend of Token.
class TextTokenRetokenizer; // Friend of Token.
struct CommandInfo;         // Passed by pointer to Lexer's verbatim helpers.
class CommandTraits;        // Held by reference inside Lexer.
namespace tok {
/// Kinds of tokens that a comments::Token can represent.
///
/// The payload notes below follow the assertions in the Token accessors.
enum TokenKind {
  eof,
  newline,
  text,              // Carries a text value.
  unknown_command,   // Command that does not have an ID.
  backslash_command, // Command with an ID, that used backslash marker.
  at_command,        // Command with an ID, that used 'at' marker.
  verbatim_block_begin, // Carries a verbatim block ID.
  verbatim_block_line,  // Carries the text of a verbatim block line.
  verbatim_block_end,   // Carries a verbatim block ID.
  verbatim_line_name,   // Carries a verbatim line ID.
  verbatim_line_text,   // Carries the text of a verbatim line.
  html_start_tag,     // <tag
  html_ident,         // attr
  html_equals,        // =
  html_quoted_string, // "blah\"blah" or 'blah\'blah'
  html_greater,       // >
  html_slash_greater, // />
  html_end_tag        // </tag
};
} // end namespace tok
/// Comment token. class Token { friend class Lexer; friend class TextTokenRetokenizer;
/// The location of the token. SourceLocation Loc;
/// The actual kind of the token. tok::TokenKind Kind;
/// Integer value associated with a token. /// /// If the token is a known command, contains command ID and TextPtr is /// unused (command spelling can be found with CommandTraits). Otherwise, /// contains the length of the string that starts at TextPtr. unsigned IntVal;
/// Length of the token spelling in comment. Can be 0 for synthenized /// tokens. unsigned Length;
/// Contains text value associated with a token. const char *TextPtr;
public: SourceLocation getLocation() const LLVM_READONLY { return Loc; } void setLocation(SourceLocation SL) { Loc = SL; }
SourceLocation getEndLocation() const LLVM_READONLY { if (Length == 0 || Length == 1) return Loc; return Loc.getLocWithOffset(Length - 1); }
tok::TokenKind getKind() const LLVM_READONLY { return Kind; } void setKind(tok::TokenKind K) { Kind = K; }
bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
unsigned getLength() const LLVM_READONLY { return Length; } void setLength(unsigned L) { Length = L; }
StringRef getText() const LLVM_READONLY { assert(is(tok::text)); return StringRef(TextPtr, IntVal); }
void setText(StringRef Text) { assert(is(tok::text)); TextPtr = Text.data(); IntVal = Text.size(); }
StringRef getUnknownCommandName() const LLVM_READONLY { assert(is(tok::unknown_command)); return StringRef(TextPtr, IntVal); }
void setUnknownCommandName(StringRef Name) { assert(is(tok::unknown_command)); TextPtr = Name.data(); IntVal = Name.size(); }
unsigned getCommandID() const LLVM_READONLY { assert(is(tok::backslash_command) || is(tok::at_command)); return IntVal; }
void setCommandID(unsigned ID) { assert(is(tok::backslash_command) || is(tok::at_command)); IntVal = ID; }
unsigned getVerbatimBlockID() const LLVM_READONLY { assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); return IntVal; }
void setVerbatimBlockID(unsigned ID) { assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); IntVal = ID; }
StringRef getVerbatimBlockText() const LLVM_READONLY { assert(is(tok::verbatim_block_line)); return StringRef(TextPtr, IntVal); }
void setVerbatimBlockText(StringRef Text) { assert(is(tok::verbatim_block_line)); TextPtr = Text.data(); IntVal = Text.size(); }
unsigned getVerbatimLineID() const LLVM_READONLY { assert(is(tok::verbatim_line_name)); return IntVal; }
void setVerbatimLineID(unsigned ID) { assert(is(tok::verbatim_line_name)); IntVal = ID; }
StringRef getVerbatimLineText() const LLVM_READONLY { assert(is(tok::verbatim_line_text)); return StringRef(TextPtr, IntVal); }
void setVerbatimLineText(StringRef Text) { assert(is(tok::verbatim_line_text)); TextPtr = Text.data(); IntVal = Text.size(); }
StringRef getHTMLTagStartName() const LLVM_READONLY { assert(is(tok::html_start_tag)); return StringRef(TextPtr, IntVal); }
void setHTMLTagStartName(StringRef Name) { assert(is(tok::html_start_tag)); TextPtr = Name.data(); IntVal = Name.size(); }
StringRef getHTMLIdent() const LLVM_READONLY { assert(is(tok::html_ident)); return StringRef(TextPtr, IntVal); }
void setHTMLIdent(StringRef Name) { assert(is(tok::html_ident)); TextPtr = Name.data(); IntVal = Name.size(); }
StringRef getHTMLQuotedString() const LLVM_READONLY { assert(is(tok::html_quoted_string)); return StringRef(TextPtr, IntVal); }
void setHTMLQuotedString(StringRef Str) { assert(is(tok::html_quoted_string)); TextPtr = Str.data(); IntVal = Str.size(); }
StringRef getHTMLTagEndName() const LLVM_READONLY { assert(is(tok::html_end_tag)); return StringRef(TextPtr, IntVal); }
void setHTMLTagEndName(StringRef Name) { assert(is(tok::html_end_tag)); TextPtr = Name.data(); IntVal = Name.size(); }
void dump(const Lexer &L, const SourceManager &SM) const; };
/// Comment lexer. class Lexer { private: Lexer(const Lexer &) = delete; void operator=(const Lexer &) = delete;
/// Allocator for strings that are semantic values of tokens and have to be /// computed (for example, resolved decimal character references). llvm::BumpPtrAllocator &Allocator;
DiagnosticsEngine &Diags;
const CommandTraits &Traits;
const char *const BufferStart; const char *const BufferEnd;
const char *BufferPtr;
/// One past end pointer for the current comment. For BCPL comments points /// to newline or BufferEnd, for C comments points to star in '*/'. const char *CommentEnd;
SourceLocation FileLoc;
/// If true, the commands, html tags, etc will be parsed and reported as /// separate tokens inside the comment body. If false, the comment text will /// be parsed into text and newline tokens. bool ParseCommands;
enum LexerCommentState : uint8_t { LCS_BeforeComment, LCS_InsideBCPLComment, LCS_InsideCComment, LCS_BetweenComments };
/// Low-level lexer state, track if we are inside or outside of comment. LexerCommentState CommentState;
enum LexerState : uint8_t { /// Lexing normal comment text LS_Normal,
/// Finished lexing verbatim block beginning command, will lex first body /// line. LS_VerbatimBlockFirstLine,
/// Lexing verbatim block body line-by-line, skipping line-starting /// decorations. LS_VerbatimBlockBody,
/// Finished lexing verbatim line beginning command, will lex text (one /// line). LS_VerbatimLineText,
/// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. LS_HTMLStartTag,
/// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. LS_HTMLEndTag };
/// Current lexing mode. LexerState State;
/// If State is LS_VerbatimBlock, contains the name of verbatim end /// command, including command marker. SmallString<16> VerbatimBlockEndCommandName;
/// Given a character reference name (e.g., "lt"), return the character that /// it stands for (e.g., "<"). StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
/// Given a Unicode codepoint as base-10 integer, return the character. StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
/// Given a Unicode codepoint as base-16 integer, return the character. StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
void formTokenWithChars(Token &Result, const char *TokEnd, tok::TokenKind Kind);
void formTextToken(Token &Result, const char *TokEnd) { StringRef Text(BufferPtr, TokEnd - BufferPtr); formTokenWithChars(Result, TokEnd, tok::text); Result.setText(Text); }
SourceLocation getSourceLocation(const char *Loc) const { assert(Loc >= BufferStart && Loc <= BufferEnd && "Location out of range for this buffer!");
const unsigned CharNo = Loc - BufferStart; return FileLoc.getLocWithOffset(CharNo); }
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { return Diags.Report(Loc, DiagID); }
/// Eat string matching regexp \code \s*\* \endcode. void skipLineStartingDecorations();
/// Skip over pure text. const char *skipTextToken();
/// Lex comment text, including commands if ParseCommands is set to true. void lexCommentText(Token &T);
void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker, const CommandInfo *Info);
void lexVerbatimBlockFirstLine(Token &T);
void lexVerbatimBlockBody(Token &T);
void setupAndLexVerbatimLine(Token &T, const char *TextBegin, const CommandInfo *Info);
void lexVerbatimLineText(Token &T);
void lexHTMLCharacterReference(Token &T);
void setupAndLexHTMLStartTag(Token &T);
void lexHTMLStartTag(Token &T);
void setupAndLexHTMLEndTag(Token &T);
void lexHTMLEndTag(Token &T);
public: Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, const CommandTraits &Traits, SourceLocation FileLoc, const char *BufferStart, const char *BufferEnd, bool ParseCommands = true);
void lex(Token &T);
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const; };
} // end namespace comments } // end namespace clang
#endif
|