Skip to content

More lexer improvements #102302

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Sep 28, 2022
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
345 changes: 174 additions & 171 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,182 @@ impl<'a> StringReader<'a> {

debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));

match self.cook_lexer_token(token.kind, start) {
Some(kind) => {
// Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
// rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
// additional validation.
let kind = match token.kind {
rustc_lexer::TokenKind::LineComment { doc_style } => {
// Skip non-doc comments
let Some(doc_style) = doc_style else {
self.lint_unicode_text_flow(start);
preceded_by_whitespace = true;
continue;
};

// Opening delimiter of the length 3 is not included into the symbol.
let content_start = start + BytePos(3);
let content = self.str_from(content_start);
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
}
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
if !terminated {
self.report_unterminated_block_comment(start, doc_style);
}

// Skip non-doc comments
let Some(doc_style) = doc_style else {
self.lint_unicode_text_flow(start);
preceded_by_whitespace = true;
continue;
};

// Opening delimiter of the length 3 and closing delimiter of the length 2
// are not included into the symbol.
let content_start = start + BytePos(3);
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
let content = self.str_from_to(content_start, content_end);
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
}
rustc_lexer::TokenKind::Whitespace => {
preceded_by_whitespace = true;
continue;
}
rustc_lexer::TokenKind::Ident => {
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
return (Token::new(kind, span), preceded_by_whitespace);
self.sess.symbol_gallery.insert(sym, span);
token::Ident(sym, false)
}
None => preceded_by_whitespace = true,
}
rustc_lexer::TokenKind::RawIdent => {
let sym = nfc_normalize(self.str_from(start + BytePos(2)));
let span = self.mk_sp(start, self.pos);
self.sess.symbol_gallery.insert(sym, span);
if !sym.can_be_raw() {
self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
}
self.sess.raw_identifier_spans.borrow_mut().push(span);
token::Ident(sym, true)
}
rustc_lexer::TokenKind::UnknownPrefix => {
self.report_unknown_prefix(start);
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.symbol_gallery.insert(sym, span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::InvalidIdent
// Do not recover an identifier with emoji if the codepoint is a confusable
// with a recoverable substitution token, like `➖`.
if !UNICODE_ARRAY
.iter()
.any(|&(c, _, _)| {
let sym = self.str_from(start);
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
}) =>
{
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default()
.push(span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
let suffix = if suffix_start < self.pos {
let string = self.str_from(suffix_start);
if string == "_" {
self.sess
.span_diagnostic
.struct_span_warn(
self.mk_sp(suffix_start, self.pos),
"underscore literal suffix is not allowed",
)
.warn(
"this was previously accepted by the compiler but is \
being phased out; it will become a hard error in \
a future release!",
)
.note(
"see issue #42326 \
<https://github.com/rust-lang/rust/issues/42326> \
for more information",
)
.emit();
None
} else {
Some(Symbol::intern(string))
}
} else {
None
};
token::Literal(token::Lit { kind, symbol, suffix })
}
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
// Include the leading `'` in the real identifier, for macro
// expansion purposes. See #12512 for the gory details of why
// this is necessary.
let lifetime_name = self.str_from(start);
if starts_with_number {
self.err_span_(start, self.pos, "lifetimes cannot start with a number");
}
let ident = Symbol::intern(lifetime_name);
token::Lifetime(ident)
}
rustc_lexer::TokenKind::Semi => token::Semi,
rustc_lexer::TokenKind::Comma => token::Comma,
rustc_lexer::TokenKind::Dot => token::Dot,
rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
rustc_lexer::TokenKind::At => token::At,
rustc_lexer::TokenKind::Pound => token::Pound,
rustc_lexer::TokenKind::Tilde => token::Tilde,
rustc_lexer::TokenKind::Question => token::Question,
rustc_lexer::TokenKind::Colon => token::Colon,
rustc_lexer::TokenKind::Dollar => token::Dollar,
rustc_lexer::TokenKind::Eq => token::Eq,
rustc_lexer::TokenKind::Bang => token::Not,
rustc_lexer::TokenKind::Lt => token::Lt,
rustc_lexer::TokenKind::Gt => token::Gt,
rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
rustc_lexer::TokenKind::And => token::BinOp(token::And),
rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),

rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
let c = self.str_from(start).chars().next().unwrap();
let mut err =
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
// FIXME: the lexer could be used to turn the ASCII version of unicode
// homoglyphs, instead of keeping a table in `check_for_substitution`into the
// token. Ideally, this should be inside `rustc_lexer`. However, we should
// first remove compound tokens like `<<` from `rustc_lexer`, and then add
// fancier error recovery to it, as there will be less overall work to do this
// way.
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
if c == '\x00' {
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
}
err.emit();
if let Some(token) = token {
token
} else {
preceded_by_whitespace = true;
continue;
}
}
rustc_lexer::TokenKind::Eof => token::Eof,
};
let span = self.mk_sp(start, self.pos);
return (Token::new(kind, span), preceded_by_whitespace);
}
}

Expand Down Expand Up @@ -158,172 +327,6 @@ impl<'a> StringReader<'a> {
}
}

/// Turns simple `rustc_lexer::TokenKind` enum into a rich
/// `rustc_ast::TokenKind`. This turns strings into interned
/// symbols and runs additional validation.
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
Some(match token {
rustc_lexer::TokenKind::LineComment { doc_style } => {
// Skip non-doc comments
let Some(doc_style) = doc_style else {
self.lint_unicode_text_flow(start);
return None;
};

// Opening delimiter of the length 3 is not included into the symbol.
let content_start = start + BytePos(3);
let content = self.str_from(content_start);
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
}
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
if !terminated {
self.report_unterminated_block_comment(start, doc_style);
}

// Skip non-doc comments
let Some(doc_style) = doc_style else {
self.lint_unicode_text_flow(start);
return None;
};

// Opening delimiter of the length 3 and closing delimiter of the length 2
// are not included into the symbol.
let content_start = start + BytePos(3);
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
let content = self.str_from_to(content_start, content_end);
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
}
rustc_lexer::TokenKind::Whitespace => return None,
rustc_lexer::TokenKind::Ident => {
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.symbol_gallery.insert(sym, span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::RawIdent => {
let sym = nfc_normalize(self.str_from(start + BytePos(2)));
let span = self.mk_sp(start, self.pos);
self.sess.symbol_gallery.insert(sym, span);
if !sym.can_be_raw() {
self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
}
self.sess.raw_identifier_spans.borrow_mut().push(span);
token::Ident(sym, true)
}
rustc_lexer::TokenKind::UnknownPrefix => {
self.report_unknown_prefix(start);
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.symbol_gallery.insert(sym, span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::InvalidIdent
// Do not recover an identifier with emoji if the codepoint is a confusable
// with a recoverable substitution token, like `➖`.
if !UNICODE_ARRAY
.iter()
.any(|&(c, _, _)| {
let sym = self.str_from(start);
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
})
=>
{
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
let suffix = if suffix_start < self.pos {
let string = self.str_from(suffix_start);
if string == "_" {
self.sess
.span_diagnostic
.struct_span_warn(
self.mk_sp(suffix_start, self.pos),
"underscore literal suffix is not allowed",
)
.warn(
"this was previously accepted by the compiler but is \
being phased out; it will become a hard error in \
a future release!",
)
.note(
"see issue #42326 \
<https://github.com/rust-lang/rust/issues/42326> \
for more information",
)
.emit();
None
} else {
Some(Symbol::intern(string))
}
} else {
None
};
token::Literal(token::Lit { kind, symbol, suffix })
}
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
// Include the leading `'` in the real identifier, for macro
// expansion purposes. See #12512 for the gory details of why
// this is necessary.
let lifetime_name = self.str_from(start);
if starts_with_number {
self.err_span_(start, self.pos, "lifetimes cannot start with a number");
}
let ident = Symbol::intern(lifetime_name);
token::Lifetime(ident)
}
rustc_lexer::TokenKind::Semi => token::Semi,
rustc_lexer::TokenKind::Comma => token::Comma,
rustc_lexer::TokenKind::Dot => token::Dot,
rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
rustc_lexer::TokenKind::At => token::At,
rustc_lexer::TokenKind::Pound => token::Pound,
rustc_lexer::TokenKind::Tilde => token::Tilde,
rustc_lexer::TokenKind::Question => token::Question,
rustc_lexer::TokenKind::Colon => token::Colon,
rustc_lexer::TokenKind::Dollar => token::Dollar,
rustc_lexer::TokenKind::Eq => token::Eq,
rustc_lexer::TokenKind::Bang => token::Not,
rustc_lexer::TokenKind::Lt => token::Lt,
rustc_lexer::TokenKind::Gt => token::Gt,
rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
rustc_lexer::TokenKind::And => token::BinOp(token::And),
rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),

rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
let c = self.str_from(start).chars().next().unwrap();
let mut err =
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
// instead of keeping a table in `check_for_substitution`into the token. Ideally,
// this should be inside `rustc_lexer`. However, we should first remove compound
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
// as there will be less overall work to do this way.
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
if c == '\x00' {
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
}
err.emit();
token?
}
rustc_lexer::TokenKind::Eof => token::Eof,
})
}

fn cook_doc_comment(
&self,
content_start: BytePos,
Expand Down