diff --git a/Cargo.lock b/Cargo.lock index 8aa66181a6ace2d118ed659a0cc6bbf23b4187da..54087f71335425f62b82721c9e822f2c90c7156e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -719,11 +719,11 @@ checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "git2" -version = "0.16.1" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf7f68c2995f392c49fffb4f95ae2c873297830eb25c6bc4c114ce8f4562acc" +checksum = "232e6a7bfe35766bf715e55a88b39a700596c0ccfd88cd3680b4cdb40d66ef70" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.5.0", "libc", "libgit2-sys", "log", @@ -1086,6 +1086,21 @@ dependencies = [ "strum_macros", ] +[[package]] +name = "hyperast_query" +version = "0.1.0" +dependencies = [ + "clap 4.5.4", + "dotenv", + "lazy_static", + "log", + "pull_request_sanitizer", + "rust_utils", + "serde", + "serde_derive", + "serde_json", +] + [[package]] name = "iana-time-zone" version = "0.1.60" @@ -1273,9 +1288,9 @@ checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "libgit2-sys" -version = "0.14.2+1.5.1" +version = "0.16.2+1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f3d95f6b51075fe9810a7ae22c7095f12b98005ab364d8544797a825ce946a4" +checksum = "ee4126d8b4ee5c9d9ea891dd875cfdc1e9d0950437179104b183d7d8a74d24e8" dependencies = [ "cc", "libc", @@ -1287,9 +1302,9 @@ dependencies = [ [[package]] name = "libssh2-sys" -version = "0.2.23" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b094a36eb4b8b8c8a7b4b8ae43b2944502be3e59cd87687595cf6b0a71b3f4ca" +checksum = "2dc8a030b787e2119a731f1951d6a773e2280c660f8ec4b0f5e1505a386e71ee" dependencies = [ "cc", "libc", diff --git a/Cargo.toml b/Cargo.toml index 25770fd85062eff0b5743d3c0efb451c9e7e4d3b..f98bf032d196d1aef36da19b3cf9ad6a3ccf664e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,8 @@ members = [ "extract_edition_script", "rust_utils", "pull_request_sanitizer", - "github_requester" + "github_requester", + "hyperast_query" ] [workspace.dependencies] @@ -13,14 +14,16 @@ plot_helper = { git = "https://github.com/fitz35/data_analyze.git", rev = "96d97 hyper_ast = { path = "../HyperAST/hyper_ast"} hyper_diff = { path = "../HyperAST/hyper_diff"} hyper_ast_cvs_git = { path = "../HyperAST/cvs/git"} +hyper_ast_gen_ts_tsquery = { path = "../HyperAST/gen/tree-sitter"} clap = { version = "4.2.5", features = ["derive"] } -git2 = { version = "0.16.1", features = ["vendored-libgit2", "vendored-openssl"] } +git2 = { version = "0.18.2", features = ["vendored-libgit2", "vendored-openssl"] } +lazy_static = "1.4.0" serde = "1.0.159" serde_derive = "1.0.159" serde_json = "1.0.94" diff --git a/extract_edition_script/src/main.rs b/extract_edition_script/src/main.rs index 5c118429ffb688ee7966ac703ffbaa256fa8105c..af8baaeace9f0323be7e5fd3acf84ebb09f531bc 100644 --- a/extract_edition_script/src/main.rs +++ b/extract_edition_script/src/main.rs @@ -1,6 +1,7 @@ use std::fs; use log::{info, error}; +use pull_request_sanitizer::load_sanitized_output_cve_data; use pull_request_sanitizer::pipeline::SanitizedOutputCveData; use rust_utils::logger::common_logger::init_logger; @@ -16,37 +17,8 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { let argv = get_program_args(); info!("🚀 Start extraction."); - let entries = fs::read_dir(&argv.dataset_path).unwrap(); + let cve_datas : Vec<SanitizedOutputCveData> = load_sanitized_output_cve_data(&argv.dataset_path)?; - let mut cve_datas : Vec<SanitizedOutputCveData> = Vec::new(); - - for entry in entries { - let path = match entry { - Ok(entry) => entry.path(), - Err(_) => continue, // Skip to next iteration if entry is an error - }; - - if !path.is_dir() { - continue; // Skip to next iteration if path is not a directory - } - - let data_path = path.join("data.json"); - if !data_path.exists() { - continue; // Skip to next iteration if data.json does not exist - } - - let contents =fs::read_to_string(&data_path)?; - - let data = match serde_json::from_str(&contents) { - Ok(data) => data, - Err(_) => panic!("Failed to parse data.json"), - }; - - cve_datas.push(data); - } - - - cve_datas.sort_by(|a, b| a.get_cve_id().cmp(&b.get_cve_id())); for cve_metadata in cve_datas.into_iter() { let id = cve_metadata.get_cve_id().to_string(); diff --git a/hyperast_query/Cargo.toml b/hyperast_query/Cargo.toml new file mode 100644 index 0000000000000000000000000000000000000000..739dde92542c18393282d8155176b123cb5cbbd7 --- /dev/null +++ b/hyperast_query/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "hyperast_query" +version = "0.1.0" +edition = "2021" + + + +[dependencies] +rust_utils = { path = "../rust_utils" } +pull_request_sanitizer = { path = "../pull_request_sanitizer" } + +lazy_static = {workspace = true} + +log = {workspace = true} + +dotenv = {workspace = true} + +clap = {workspace = true} + +serde = {workspace = true} +serde_derive = {workspace = true} +serde_json = {workspace = true} \ No newline at end of file diff --git a/hyperast_query/src/main.rs b/hyperast_query/src/main.rs new file mode 100644 index 0000000000000000000000000000000000000000..84b5f815cfe364eaf1ac166622f60853a48fd038 --- /dev/null +++ b/hyperast_query/src/main.rs @@ -0,0 +1,28 @@ + +use log::info; +use pull_request_sanitizer::load_sanitized_output_cve_data; +use pull_request_sanitizer::pipeline::SanitizedOutputCveData; +use rust_utils::logger::common_logger::init_logger; + +use crate::params::argv::get_program_args; + +mod params; + + + +fn main() -> Result<(), Box<dyn std::error::Error>> { + dotenv::dotenv().ok(); + init_logger(); + let argv = get_program_args(); + info!("🚀 Start testing the hyperast query."); + + + let cve_datas : Vec<SanitizedOutputCveData> = load_sanitized_output_cve_data(&argv.dataset_path)?; + + for cve_data in cve_datas { + + } + + + Ok(()) +} \ No newline at end of file diff --git a/hyperast_query/src/params/argv.rs b/hyperast_query/src/params/argv.rs new file mode 100644 index 0000000000000000000000000000000000000000..8cdd89e5229ae75399b65a0e5af006e6f6a9fb17 --- /dev/null +++ b/hyperast_query/src/params/argv.rs @@ -0,0 +1,16 @@ +use clap::Parser; + +/// Benchmark the query on the hyper ast +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +pub struct Argv { + /// the dataset path to use + #[arg(short, long)] + pub dataset_path : String, +} + + + +pub fn get_program_args() -> Argv { + return Argv::parse(); +} \ No newline at end of file diff --git a/hyperast_query/src/params/mod.rs b/hyperast_query/src/params/mod.rs new file mode 100644 index 0000000000000000000000000000000000000000..b0be02939b5f455dd7034180e0063be4af3c0611 --- /dev/null +++ b/hyperast_query/src/params/mod.rs @@ -0,0 +1,3 @@ +pub mod argv; + + diff --git a/pull_request_sanitizer/src/lib.rs b/pull_request_sanitizer/src/lib.rs index 578a1fb594d5fcdd2398e55c4e22721d42741920..6229cca93145f279d9ac93133e602e658c34e7c6 100644 --- a/pull_request_sanitizer/src/lib.rs +++ b/pull_request_sanitizer/src/lib.rs @@ -1,5 +1,45 @@ +use std::fs; +use std::path::Path; + +use pipeline::SanitizedOutputCveData; + mod params; pub mod pipeline; mod errors; mod utils; -mod repo; \ No newline at end of file +mod repo; + +pub fn load_sanitized_output_cve_data<P : AsRef<Path>>(cve_dir_path : P) -> Result<Vec<SanitizedOutputCveData>, Box<dyn std::error::Error>> { + let entries = fs::read_dir(&cve_dir_path).unwrap(); + + let mut cve_datas : Vec<SanitizedOutputCveData> = Vec::new(); + + for entry in entries { + let path = match entry { + Ok(entry) => entry.path(), + Err(_) => continue, // Skip to next iteration if entry is an error + }; + + if !path.is_dir() { + continue; // Skip to next iteration if path is not a directory + } + + let data_path = path.join("data.json"); + if !data_path.exists() { + continue; // Skip to next iteration if data.json does not exist + } + + let contents =fs::read_to_string(&data_path)?; + + let data = match serde_json::from_str(&contents) { + Ok(data) => data, + Err(_) => panic!("Failed to parse data.json"), + }; + + cve_datas.push(data); + } + + cve_datas.sort_by(|a, b| a.get_cve_id().cmp(&b.get_cve_id())); + + Ok(cve_datas) +} \ No newline at end of file diff --git a/pull_request_sanitizer/src/main.rs b/pull_request_sanitizer/src/main.rs index ee54a07721cbdaafbd3dfadfd95eaa3c350ece92..69fda20fdff94e60f7eb0e3dbd7fce43fb7973af 100644 --- a/pull_request_sanitizer/src/main.rs +++ b/pull_request_sanitizer/src/main.rs @@ -43,7 +43,7 @@ fn main() { continue; } - match pipeline::sanitize_cve(&path) { + match pipeline::sanitize_cve(&path, argv.check_in_repo) { Ok((result, patch_data)) => { fs::create_dir_all(&output_cve_folder).unwrap(); let output_file = output_cve_folder.join("data.json"); diff --git a/pull_request_sanitizer/src/params/argv.rs b/pull_request_sanitizer/src/params/argv.rs index ae9ef4aba077a0ad8ef1e88484b7ec9714bc1671..1fc3a63317f0da4d3175413e91afb443e94586ce 100644 --- a/pull_request_sanitizer/src/params/argv.rs +++ b/pull_request_sanitizer/src/params/argv.rs @@ -15,6 +15,11 @@ pub struct Argv { /// delete the output folder if it already exists #[arg(long, action)] pub delete_output_folder : bool, + + + /// check also in the repo itself (using git2, take a lot of time and space) + #[arg(long, action)] + pub check_in_repo : bool, } diff --git a/pull_request_sanitizer/src/pipeline.rs b/pull_request_sanitizer/src/pipeline.rs index a37099ad41b7d52c77cf0248848cc81c221d0d16..ac627f85f4be7bcd758056354b4d6a01e0f7452b 100644 --- a/pull_request_sanitizer/src/pipeline.rs +++ b/pull_request_sanitizer/src/pipeline.rs @@ -55,7 +55,7 @@ impl SanitizedOutputCveData { /// Sanitize a cve directory (return the sanitized data and the patch data) -pub(crate) fn sanitize_cve<P : AsRef<Path>>(cve_dir_path : P) -> Result<(SanitizedOutputCveData, String), CveSanitizerError> { +pub(crate) fn sanitize_cve<P : AsRef<Path>>(cve_dir_path : P, check_in_repo : bool) -> Result<(SanitizedOutputCveData, String), CveSanitizerError> { let dir_path = cve_dir_path.as_ref(); @@ -86,11 +86,17 @@ pub(crate) fn sanitize_cve<P : AsRef<Path>>(cve_dir_path : P) -> Result<(Sanitiz repo_type, language, warnings - ) = get_and_and_check_repo_name(&pull_request_data, merge_commit_sha.as_str())?; + ) = get_and_and_check_repo_name(&pull_request_data, merge_commit_sha.as_str(), check_in_repo)?; // prepare the new commit in the repo - let repo = load_repo_from_name(&repo_name); - let oid = apply_patch_and_retrieve_commit(&repo, merge_commit_sha.as_str(), patch_data.as_str())?; + let merged_commit_sha = if check_in_repo { + let repo = load_repo_from_name(&repo_name); + let oid = apply_patch_and_retrieve_commit(&repo, merge_commit_sha.as_str(), patch_data.as_str())?; + oid.to_string() + } else { + String::new() + }; + @@ -103,7 +109,7 @@ pub(crate) fn sanitize_cve<P : AsRef<Path>>(cve_dir_path : P) -> Result<(Sanitiz language, warnings, commit_sha : merge_commit_sha.to_string(), - reconstructed_merged_commit_sha : oid.to_string() + reconstructed_merged_commit_sha : merged_commit_sha, }; diff --git a/pull_request_sanitizer/src/repo/mod.rs b/pull_request_sanitizer/src/repo/mod.rs index 2b45276de0a08674909ed7eda1bb4f219ad0adb7..337d0603a26cb60c331a34627f7469dc4ed56e83 100644 --- a/pull_request_sanitizer/src/repo/mod.rs +++ b/pull_request_sanitizer/src/repo/mod.rs @@ -76,7 +76,8 @@ impl RepoType { &self, pull_request_data : &serde_json::Value, commit_to_test : &str, - _main_url : &str + _main_url : &str, + check_in_repo : bool, ) -> Result<(String, Option<Language>), CveSanitizerApiWarning> { // determine the repo name let repo_name = pull_request_data.get_json_value_from_path( @@ -109,9 +110,10 @@ impl RepoType { } // test if the commit is in the cloned repo - let repo = load_repo_from_name(repo_name); - let _commit = retrieve_commit(&repo, commit_to_test).map_err(|_| self.get_merge_commit_not_found_in_cloned_repo_warning())?; - + if check_in_repo { + let repo = load_repo_from_name(repo_name); + let _commit = retrieve_commit(&repo, commit_to_test).map_err(|_| self.get_merge_commit_not_found_in_cloned_repo_warning())?; + } Ok((repo_name.to_string(), Language::new_from_github_language(language))) } } @@ -120,7 +122,8 @@ impl RepoType { /// if the commit is in the 2 repos, it will return the head repo pub fn get_and_and_check_repo_name( pull_request_data : &serde_json::Value, - commit_to_test : &str + commit_to_test : &str, + check_in_repo : bool, ) -> Result<(String, RepoType, Option<Language>, Vec<CveSanitizerApiWarning>), CveSanitizerError> { let mut warnings = vec![]; @@ -133,11 +136,11 @@ pub fn get_and_and_check_repo_name( // ----------- head repo ------------ - let head_repo = RepoType::Head.test_repo(pull_request_data, commit_to_test, main_url); + let head_repo = RepoType::Head.test_repo(pull_request_data, commit_to_test, main_url, check_in_repo); // ----------- base repo ------------ - let base_repo = RepoType::Base.test_repo(pull_request_data, commit_to_test, main_url); + let base_repo = RepoType::Base.test_repo(pull_request_data, commit_to_test, main_url, check_in_repo); // collect the results if head_repo.is_err() { diff --git a/rust_utils/src/git_utils/mod.rs b/rust_utils/src/git_utils/mod.rs index 731d9e571f3daba136c15167daf82143860cb879..8ae1e8f4dbfe51d88c24672f1b99498f5b0deb1d 100644 --- a/rust_utils/src/git_utils/mod.rs +++ b/rust_utils/src/git_utils/mod.rs @@ -23,7 +23,9 @@ pub fn apply_patch_and_retrieve_commit<'a>( repository.set_index(&mut new_index)?; // set the new index - let new_tree = new_index.write_tree()?; // write the new tree + let new_tree_oid = new_index.write_tree()?; // write the new tree + + let new_tree = repository.find_tree(new_tree_oid)?; // find the new tree let signature = Signature::now("sanitized cve automatique", "clement.lahoche@inria.fr")?; // create a signature