Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions shared/tree-sitter-extractor/src/extractor/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,9 @@ pub fn extract(
.run_from_tree(&tree, source)
.unwrap_or_else(|e| panic!("Desugaring failed for {path_str}: {e}"));
traverse_yeast(&ast, &mut visitor);
// Comments and other `extra` nodes are not represented in the desugared
// AST, so recover them directly from the original parse tree.
traverse_extras(&tree, &mut visitor);
} else {
traverse(&tree, &mut visitor);
}
Expand Down Expand Up @@ -365,6 +368,8 @@ struct Visitor<'a> {
ast_node_parent_table_name: String,
/// Language-specific name of the tokeninfo table
tokeninfo_table_name: String,
/// Language-specific name of the trivia tokeninfo table
trivia_tokeninfo_table_name: String,
/// A lookup table from type name to node types
schema: &'a NodeTypeMap,
/// A stack for gathering information from child nodes. Whenever a node is
Expand Down Expand Up @@ -395,11 +400,33 @@ impl<'a> Visitor<'a> {
ast_node_location_table_name: format!("{language_prefix}_ast_node_location"),
ast_node_parent_table_name: format!("{language_prefix}_ast_node_parent"),
tokeninfo_table_name: format!("{language_prefix}_tokeninfo"),
trivia_tokeninfo_table_name: format!("{language_prefix}_trivia_tokeninfo"),
schema,
stack: Vec::new(),
}
}

/// Emits a `TriviaToken` for the given `extra` node (e.g. a comment) from
/// the original parse tree. Trivia tokens carry a location and their source
/// text, but are not attached to a parent in the (possibly desugared) AST.
fn emit_trivia_token(&mut self, node: &Node) {
let id = self.trap_writer.fresh_id();
let loc = location_for(self, self.file_label, node);
let loc_label = location_label(self.trap_writer, loc);
self.trap_writer.add_tuple(
&self.ast_node_location_table_name,
vec![trap::Arg::Label(id), trap::Arg::Label(loc_label)],
);
self.trap_writer.add_tuple(
&self.trivia_tokeninfo_table_name,
vec![
trap::Arg::Label(id),
trap::Arg::Int(node.kind_id() as usize),
sliced_source_arg(self.source, node),
],
);
}

fn record_parse_error(&mut self, loc: trap::Label, mesg: &diagnostics::DiagnosticMessage) {
self.diagnostics_writer.write(mesg);
let id = self.trap_writer.fresh_id();
Expand Down Expand Up @@ -835,6 +862,24 @@ fn traverse(tree: &Tree, visitor: &mut Visitor) {
}
}

/// Walks the original tree-sitter tree and emits a `TriviaToken` for every
/// `extra` node (e.g. a comment). Used to preserve comments that would
/// otherwise be lost after a desugaring pass rewrites the tree.
fn traverse_extras(tree: &Tree, visitor: &mut Visitor) {
emit_extras_in(visitor, tree.root_node());
}

fn emit_extras_in(visitor: &mut Visitor, node: Node<'_>) {
let mut cursor = node.walk();
for child in node.children(&mut cursor) {
if child.is_extra() {
visitor.emit_trivia_token(&child);
} else {
emit_extras_in(visitor, child);
}
}
}

fn traverse_yeast(tree: &yeast::Ast, visitor: &mut Visitor) {
use yeast::Cursor;
let mut cursor = tree.walk();
Expand Down
56 changes: 37 additions & 19 deletions shared/tree-sitter-extractor/src/generator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,12 @@ pub fn generate(
let node_parent_table_name = format!("{}_ast_node_parent", &prefix);
let token_name = format!("{}_token", &prefix);
let tokeninfo_name = format!("{}_tokeninfo", &prefix);
let trivia_token_name = format!("{}_trivia_token", &prefix);
let trivia_tokeninfo_name = format!("{}_trivia_tokeninfo", &prefix);
let reserved_word_name = format!("{}_reserved_word", &prefix);
// When a desugaring is configured, comments and other `extra` nodes are
// preserved from the original parse tree as `TriviaToken`s.
let has_trivia_tokens = language.desugar.is_some();
let effective_node_types: String = match language
.desugar
.as_ref()
Expand All @@ -85,28 +90,35 @@ pub fn generate(
let nodes = node_types::read_node_types_str(&prefix, &effective_node_types)?;
let (dbscheme_entries, mut ast_node_members, token_kinds) = convert_nodes(&nodes);
ast_node_members.insert(&token_name);
if has_trivia_tokens {
ast_node_members.insert(&trivia_token_name);
}
writeln!(&mut dbscheme_writer, "/*- {} dbscheme -*/", language.name)?;
dbscheme::write(&mut dbscheme_writer, &dbscheme_entries)?;
let token_case = create_token_case(&token_name, token_kinds);
dbscheme::write(
&mut dbscheme_writer,
&[
dbscheme::Entry::Table(create_tokeninfo(&tokeninfo_name, &token_name)),
dbscheme::Entry::Case(token_case),
dbscheme::Entry::Union(dbscheme::Union {
name: &ast_node_name,
members: ast_node_members,
}),
dbscheme::Entry::Table(create_ast_node_location_table(
&node_location_table_name,
&ast_node_name,
)),
dbscheme::Entry::Table(create_ast_node_parent_table(
&node_parent_table_name,
&ast_node_name,
)),
],
)?;
let mut dbscheme_tail = vec![
dbscheme::Entry::Table(create_tokeninfo(&tokeninfo_name, &token_name)),
dbscheme::Entry::Case(token_case),
];
if has_trivia_tokens {
dbscheme_tail.push(dbscheme::Entry::Table(create_tokeninfo(
&trivia_tokeninfo_name,
&trivia_token_name,
)));
}
dbscheme_tail.push(dbscheme::Entry::Union(dbscheme::Union {
name: &ast_node_name,
members: ast_node_members,
}));
dbscheme_tail.push(dbscheme::Entry::Table(create_ast_node_location_table(
&node_location_table_name,
&ast_node_name,
)));
dbscheme_tail.push(dbscheme::Entry::Table(create_ast_node_parent_table(
&node_parent_table_name,
&ast_node_name,
)));
dbscheme::write(&mut dbscheme_writer, &dbscheme_tail)?;

let mut body = vec![
ql::TopLevel::Class(ql_gen::create_ast_node_class(
Expand All @@ -116,6 +128,12 @@ pub fn generate(
)),
ql::TopLevel::Class(ql_gen::create_token_class(&token_name, &tokeninfo_name)),
];
if has_trivia_tokens {
body.push(ql::TopLevel::Class(ql_gen::create_trivia_token_class(
&trivia_token_name,
&trivia_tokeninfo_name,
)));
}
// Only emit the ReservedWord class when there are actually unnamed token
// types in the schema (i.e., @{prefix}_reserved_word exists in the dbscheme).
// When converting from a YEAST YAML schema that has no unnamed tokens, this
Expand Down
64 changes: 64 additions & 0 deletions shared/tree-sitter-extractor/src/generator/ql_gen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,70 @@ pub fn create_token_class<'a>(token_type: &'a str, tokeninfo: &'a str) -> ql::Cl
}
}

/// Creates the `TriviaToken` class. Trivia tokens (e.g. comments) are
/// `extra` nodes preserved from the original parse tree even when the tree has
/// been rewritten by a desugaring pass. They are not part of the regular
/// `Token` hierarchy because they do not appear in the (possibly desugared)
/// output schema.
pub fn create_trivia_token_class<'a>(
trivia_token_type: &'a str,
trivia_tokeninfo: &'a str,
) -> ql::Class<'a> {
let trivia_tokeninfo_arity = 3; // id, kind, value
let get_value = ql::Predicate {
qldoc: Some(String::from("Gets the source text of this trivia token.")),
name: "getValue",
overridden: false,
is_private: false,
is_final: true,
return_type: Some(ql::Type::String),
formal_parameters: vec![],
body: create_get_field_expr_for_column_storage(
"result",
trivia_tokeninfo,
1,
trivia_tokeninfo_arity,
),
overlay: None,
};
let to_string = ql::Predicate {
qldoc: Some(String::from(
"Gets a string representation of this element.",
)),
name: "toString",
overridden: true,
is_private: false,
is_final: true,
return_type: Some(ql::Type::String),
formal_parameters: vec![],
body: ql::Expression::Equals(
Box::new(ql::Expression::Var("result")),
Box::new(ql::Expression::Dot(
Box::new(ql::Expression::Var("this")),
"getValue",
vec![],
)),
),
overlay: None,
};
ql::Class {
qldoc: Some(String::from(
"A trivia token, such as a comment, preserved from the original parse tree.",
)),
name: "TriviaToken",
is_abstract: false,
supertypes: vec![ql::Type::At(trivia_token_type), ql::Type::Normal("AstNode")]
.into_iter()
.collect(),
characteristic_predicate: None,
predicates: vec![
get_value,
to_string,
create_get_a_primary_ql_class("TriviaToken", false),
],
}
}

// Creates the `ReservedWord` class.
pub fn create_reserved_word_class(db_name: &str) -> ql::Class<'_> {
let class_name = "ReservedWord";
Expand Down
12 changes: 12 additions & 0 deletions unified/ql/lib/codeql/unified/Ast.qll
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,18 @@ module Unified {
override string getAPrimaryQlClass() { result = "Token" }
}

/** A trivia token, such as a comment, preserved from the original parse tree. */
class TriviaToken extends @unified_trivia_token, AstNode {
/** Gets the source text of this trivia token. */
final string getValue() { unified_trivia_tokeninfo(this, _, result) }

/** Gets a string representation of this element. */
final override string toString() { result = this.getValue() }

/** Gets the name of the primary QL class for this element. */
override string getAPrimaryQlClass() { result = "TriviaToken" }
}

/** Gets the file containing the given `node`. */
private @file getNodeFile(@unified_ast_node node) {
exists(@location_default loc | unified_ast_node_location(node, loc) |
Expand Down
18 changes: 18 additions & 0 deletions unified/ql/lib/codeql/unified/Comments.qll
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/** Provides classes for working with comments. */

private import unified

/**
* A comment appearing in the source code.
*/
class Comment extends TriviaToken {
// At the moment, comments are the only type trivia token we extract
/**
* Gets the text inside this comment, not counting the delimeters.
*/
string getCommentText() {
result = this.getValue().regexpCapture("//(.*)", 1)
or
result = this.getValue().regexpCapture("(?s)/\\*(.*)\\*/", 1)
}
}
8 changes: 7 additions & 1 deletion unified/ql/lib/unified.dbscheme
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,13 @@ case @unified_token.kind of
;


@unified_ast_node = @unified_apply_pattern | @unified_binary_expr | @unified_block_stmt | @unified_call_expr | @unified_expr_condition | @unified_expr_stmt | @unified_guard_if_stmt | @unified_if_stmt | @unified_lambda_expr | @unified_let_pattern_condition | @unified_member_access_expr | @unified_name_expr | @unified_parameter | @unified_sequence_condition | @unified_token | @unified_top_level | @unified_tuple_pattern | @unified_unary_expr | @unified_var_pattern | @unified_variable_declaration_stmt | @unified_variable_declarator
unified_trivia_tokeninfo(
unique int id: @unified_trivia_token,
int kind: int ref,
string value: string ref
);

@unified_ast_node = @unified_apply_pattern | @unified_binary_expr | @unified_block_stmt | @unified_call_expr | @unified_expr_condition | @unified_expr_stmt | @unified_guard_if_stmt | @unified_if_stmt | @unified_lambda_expr | @unified_let_pattern_condition | @unified_member_access_expr | @unified_name_expr | @unified_parameter | @unified_sequence_condition | @unified_token | @unified_top_level | @unified_trivia_token | @unified_tuple_pattern | @unified_unary_expr | @unified_var_pattern | @unified_variable_declaration_stmt | @unified_variable_declarator

unified_ast_node_location(
unique int node: @unified_ast_node ref,
Expand Down
8 changes: 8 additions & 0 deletions unified/ql/lib/unified.qll
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/**
* Provides classes for working with the AST, as well as files and locations.
*/

import codeql.Locations
import codeql.files.FileSystem
import codeql.unified.Ast::Unified
import codeql.unified.Comments
3 changes: 3 additions & 0 deletions unified/ql/test/library-tests/comments/comments.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
| comments.swift:1:1:1:22 | // Hello this is swift | Hello this is swift |
Comment thread
asgerf marked this conversation as resolved.
| comments.swift:3:1:6:3 | /*\n * This is a multi-line comment\n * It should be ignored by the parser\n */ | \n * This is a multi-line comment\n * It should be ignored by the parser\n |
| comments.swift:9:5:9:36 | // This is a single-line comment | This is a single-line comment |
3 changes: 3 additions & 0 deletions unified/ql/test/library-tests/comments/comments.ql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import unified

query predicate comments(Comment c, string text) { text = c.getCommentText() }
11 changes: 11 additions & 0 deletions unified/ql/test/library-tests/comments/comments.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Hello this is swift

/*
* This is a multi-line comment
* It should be ignored by the parser
*/

func hello() {
// This is a single-line comment
print("Hello, world!")
}
Loading