diff --git a/src/libexpr/primops/fetchTree.cc b/src/libexpr/primops/fetchTree.cc index b8b99d4fa8d..df6f6ad6a8e 100644 --- a/src/libexpr/primops/fetchTree.cc +++ b/src/libexpr/primops/fetchTree.cc @@ -306,6 +306,23 @@ static RegisterPrimOp primop_fetchGit({ - url\ The URL of the repo. + The URL can be a path value, in order to fetch from a local + repository. When `ref` and `rev` unset, the default behavior is to + produce an output that includes the uncommitted changes to files known + to git. This is similar to `HEAD` after running `git add -u` and + `git commit`. Files that have not been added to the index or `HEAD` + will be ignored. + + This will not run the git smudge filters, but changed files will be + processed with the git clean filter, in order to be consistent with + remote repositories. This way, decrypted git-crypt secrets are not + added to the store. Warning: the current implementation does not + apply this logic to submodules. + + Tip: if you prefer to keep your index clean, you can use `git add -N` + to add files to the index without adding their contents. You can add + the contents later with `git add -p` or `git add`. + - name\ The name of the directory the repo should be exported to in the store. Defaults to the basename of the URL. @@ -315,11 +332,13 @@ static RegisterPrimOp primop_fetchGit({ - ref\ The git ref to look for the requested revision under. This is - often a branch or tag name. Defaults to `HEAD`. + often a branch or tag name. For remote repositories, this defaults + to `HEAD`. If the `url` is a path value, some local changes are + included; see `url`. By default, the `ref` value is prefixed with `refs/heads/`. As of Nix 2.3.0 Nix will not prefix `refs/heads/` if `ref` starts - with `refs/`. + with `refs/`. `HEAD` is also not prefixed. 
- submodules\ A Boolean parameter that specifies whether submodules should be diff --git a/src/libfetchers/git.cc b/src/libfetchers/git.cc index d8e0dbe0a4d..dd2a5c90399 100644 --- a/src/libfetchers/git.cc +++ b/src/libfetchers/git.cc @@ -180,6 +180,16 @@ struct GitInputScheme : InputScheme if (submodules) cacheType += "-submodules"; if (allRefs) cacheType += "-all-refs"; + bool isDirtyTree = false; + + // TODO: Recursively use the `write-tree` logic for the submodules. + // For now, we use a diff to get the uncommitted submodule + // changes. This only works correctly in cases where the diff + // does not depend on smudge/clean behavior, which we can't + // assume. The submodule worktree does come from a fresh repo, + // so at least it seems that git-crypt security is not at risk. + std::string dirtySubmoduleDiff; + auto getImmutableAttrs = [&]() { return Attrs({ @@ -197,6 +207,13 @@ struct GitInputScheme : InputScheme if (!shallow) input.attrs.insert_or_assign("revCount", getIntAttr(infoAttrs, "revCount")); input.attrs.insert_or_assign("lastModified", getIntAttr(infoAttrs, "lastModified")); + + // If the tree is dirty, we use a tree hash internally, but we don't + // want to expose it. + if (isDirtyTree) { + input.attrs.insert_or_assign("rev", "0000000000000000000000000000000000000000"); + } + return { Tree(store->toRealPath(storePath), std::move(storePath)), input @@ -211,6 +228,8 @@ struct GitInputScheme : InputScheme auto [isLocal, actualUrl_] = getActualUrl(input); auto actualUrl = actualUrl_; // work around clang bug + bool haveHEAD = true; + // If this is a local directory and no ref or revision is // given, then allow the use of an unclean working tree. 
if (!input.getRef() && !input.getRev() && isLocal) { @@ -227,10 +246,10 @@ struct GitInputScheme : InputScheme if (commonGitDir != ".git") gitDir = commonGitDir; - bool haveCommits = !readDirectory(gitDir + "/refs/heads").empty(); + haveHEAD = !readDirectory(gitDir + "/refs/heads").empty(); try { - if (haveCommits) { + if (haveHEAD) { runProgram("git", true, { "-C", actualUrl, "diff-index", "--quiet", "HEAD", "--" }); clean = true; } @@ -240,7 +259,8 @@ struct GitInputScheme : InputScheme if (!clean) { - /* This is an unclean working tree. So copy all tracked files. */ + /* This is an unclean working tree. We can't use the worktree + files, because those may be smudged. */ if (!settings.allowDirty) throw Error("Git tree '%s' is dirty", actualUrl); @@ -248,44 +268,66 @@ struct GitInputScheme : InputScheme if (settings.warnDirty) warn("Git tree '%s' is dirty", actualUrl); - auto gitOpts = Strings({ "-C", actualUrl, "ls-files", "-z" }); - if (submodules) - gitOpts.emplace_back("--recurse-submodules"); - - auto files = tokenizeString>( - runProgram("git", true, gitOpts), "\0"s); - - PathFilter filter = [&](const Path & p) -> bool { - assert(hasPrefix(p, actualUrl)); - std::string file(p, actualUrl.size() + 1); - - auto st = lstat(p); - - if (S_ISDIR(st.st_mode)) { - auto prefix = file + "/"; - auto i = files.lower_bound(prefix); - return i != files.end() && hasPrefix(*i, prefix); - } - - return files.count(file); - }; + isDirtyTree = true; + + // We can't use an existing file for the temporary git index, + // so we need to use a tmpdir instead of a tmpfile. + // Non-submodule changes are captured by the tree we build using + // this temporary index. 
+ Path tmpIndexDir = createTempDir(); + AutoDelete delTmpIndexDir(tmpIndexDir, true); + Path tmpIndex = tmpIndexDir + "/tmp-git-index"; + + std::set files = tokenizeString>( + runProgram("git", true, { "-C", actualUrl, "ls-files", "-z" }), + "\0"s); + + { + RunOptions gitOptions("git", { "-C", actualUrl, "add", "--no-warn-embedded-repo", "--" }); + auto env = getEnv(); + env["GIT_INDEX_FILE"] = tmpIndex; + gitOptions.environment = env; + for (auto file : files) + gitOptions.args.push_back(file); + + auto result = runProgram(gitOptions); + if (result.first) + throw ExecError(result.first, fmt("program git add -u %1%", statusToString(result.first))); + } + std::string tree; + { + RunOptions gitOptions("git", { "-C", actualUrl, "write-tree" }); + auto env = getEnv(); + env["GIT_INDEX_FILE"] = tmpIndex; + gitOptions.environment = env; + + auto result = runProgram(gitOptions); + if (result.first) + throw ExecError(result.first, fmt("program git write-tree %1%", statusToString(result.first))); + tree = trim(result.second); + + // Note [tree as rev] + // We set `rev` to a tree object, even if it's normally a + // commit object. This way, we get some use out of the + // cache, to avoid copying files unnecessarily. + input.attrs.insert_or_assign("rev", trim(result.second)); + } - auto storePath = store->addToStore("source", actualUrl, FileIngestionMethod::Recursive, htSHA256, filter); + // Use a diff to gather submodule changes as well. See `dirtySubmoduleDiff` + if (submodules) { + RunOptions gitOptions("git", { "-C", actualUrl, "diff", tree, "--submodule=diff" }); - // FIXME: maybe we should use the timestamp of the last - // modified dirty file? - input.attrs.insert_or_assign( - "lastModified", - haveCommits ? 
std::stoull(runProgram("git", true, { "-C", actualUrl, "log", "-1", "--format=%ct", "--no-show-signature", "HEAD" })) : 0); + auto result = runProgram(gitOptions); + if (result.first) + throw ExecError(result.first, fmt("program git diff %1%", statusToString(result.first))); - return { - Tree(store->toRealPath(storePath), std::move(storePath)), - input - }; + dirtySubmoduleDiff = result.second; + } } } - if (!input.getRef()) input.attrs.insert_or_assign("ref", isLocal ? readHead(actualUrl) : "master"); + if (!input.getRef() && haveHEAD) + input.attrs.insert_or_assign("ref", isLocal ? readHead(actualUrl) : "master"); Attrs mutableAttrs({ {"type", cacheType}, @@ -406,49 +448,96 @@ struct GitInputScheme : InputScheme AutoDelete delTmpDir(tmpDir, true); PathFilter filter = defaultPathFilter; - RunOptions checkCommitOpts( - "git", - { "-C", repoDir, "cat-file", "commit", input.getRev()->gitRev() } - ); - checkCommitOpts.searchPath = true; - checkCommitOpts.mergeStderrToStdout = true; - - auto result = runProgram(checkCommitOpts); - if (WEXITSTATUS(result.first) == 128 - && result.second.find("bad file") != std::string::npos - ) { - throw Error( - "Cannot find Git revision '%s' in ref '%s' of repository '%s'! " - "Please make sure that the " ANSI_BOLD "rev" ANSI_NORMAL " exists on the " - ANSI_BOLD "ref" ANSI_NORMAL " you've specified or add " ANSI_BOLD - "allRefs = true;" ANSI_NORMAL " to " ANSI_BOLD "fetchGit" ANSI_NORMAL ".", - input.getRev()->gitRev(), - *input.getRef(), - actualUrl + // Skip check if rev is set to a tree object. 
See Note [tree as rev] + if (!isDirtyTree) { + RunOptions checkCommitOpts( + "git", + { "-C", repoDir, "cat-file", "commit", input.getRev()->gitRev() } ); + checkCommitOpts.searchPath = true; + checkCommitOpts.mergeStderrToStdout = true; + + auto result = runProgram(checkCommitOpts); + if (WEXITSTATUS(result.first) == 128 + && result.second.find("bad file") != std::string::npos + ) { + throw Error( + "Cannot find Git revision '%s' in ref '%s' of repository '%s'! " + "Please make sure that the " ANSI_BOLD "rev" ANSI_NORMAL " exists on the " + ANSI_BOLD "ref" ANSI_NORMAL " you've specified or add " ANSI_BOLD + "allRefs = true;" ANSI_NORMAL " to " ANSI_BOLD "fetchGit" ANSI_NORMAL ".", + input.getRev()->gitRev(), + *input.getRef(), + actualUrl + ); + } } if (submodules) { Path tmpGitDir = createTempDir(); AutoDelete delTmpGitDir(tmpGitDir, true); + // For this checkout approach, we need a commit, not just a treeish. + if (isDirtyTree) { + RunOptions gitOptions("git", { "-C", actualUrl, "commit-tree", "-m", "temporary commit for dirty tree", input.getRev()->gitRev() }); + auto result = runProgram(gitOptions); + if (result.first) + throw ExecError(result.first, fmt("program git commit-tree %1%", statusToString(result.first))); + input.attrs.insert_or_assign("rev", trim(result.second)); + } + runProgram("git", true, { "init", tmpDir, "--separate-git-dir", tmpGitDir }); // TODO: repoDir might lack the ref (it only checks if rev // exists, see FIXME above) so use a big hammer and fetch // everything to ensure we get the rev. 
runProgram("git", true, { "-C", tmpDir, "fetch", "--quiet", "--force", - "--update-head-ok", "--", repoDir, "refs/*:refs/*" }); + "--update-head-ok", "--", repoDir, "refs/*:refs/*", + input.getRev()->gitRev() }); runProgram("git", true, { "-C", tmpDir, "checkout", "--quiet", input.getRev()->gitRev() }); runProgram("git", true, { "-C", tmpDir, "remote", "add", "origin", actualUrl }); runProgram("git", true, { "-C", tmpDir, "submodule", "--quiet", "update", "--init", "--recursive" }); + if (dirtySubmoduleDiff.size()) { + RunOptions gitOptions("git", { "-C", tmpDir, "apply" }); + StringSource s(dirtySubmoduleDiff); + gitOptions.standardIn = &s; + auto result = runProgram(gitOptions); + if (result.first) + throw ExecError(result.first, fmt("program git apply %1%", statusToString(result.first))); + } + filter = isNotDotGitDirectory; } else { + Strings noSmudgeOptions; + { + RunOptions gitOptions("git", { "-C", repoDir, "config", "-l" }); + auto result = runProgram(gitOptions); + auto ss = std::stringstream{result.second}; + StringSet filters; + + for (std::string line; std::getline(ss, line, '\n');) { + std::string prefix = "filter."; + std::string infix = ".smudge="; + auto infixPos = line.find(infix); + if (hasPrefix(line, prefix) && infixPos != std::string::npos) { + filters.emplace(line.substr(prefix.size(), infixPos - prefix.size())); + } + } + for (auto filter : filters) { + noSmudgeOptions.emplace_back("-c"); + noSmudgeOptions.emplace_back("filter." + filter + ".smudge=cat --"); + } + } + // FIXME: should pipe this, or find some better way to extract a // revision. 
auto source = sinkToSource([&](Sink & sink) { - RunOptions gitOptions("git", { "-C", repoDir, "archive", input.getRev()->gitRev() }); + RunOptions gitOptions("git", noSmudgeOptions); + gitOptions.args.push_back("-C"); + gitOptions.args.push_back(repoDir); + gitOptions.args.push_back("archive"); + gitOptions.args.push_back(input.getRev()->gitRev()); gitOptions.standardOut = &sink; runProgram2(gitOptions); }); @@ -458,7 +547,13 @@ struct GitInputScheme : InputScheme auto storePath = store->addToStore(name, tmpDir, FileIngestionMethod::Recursive, htSHA256, filter); - auto lastModified = std::stoull(runProgram("git", true, { "-C", repoDir, "log", "-1", "--format=%ct", "--no-show-signature", input.getRev()->gitRev() })); + const auto now = std::chrono::system_clock::now(); + + // FIXME: when isDirtyTree, maybe we should use the timestamp + // of the last modified dirty file? + auto lastModified = isDirtyTree ? + (std::chrono::duration_cast(now.time_since_epoch()).count()) + : std::stoull(runProgram("git", true, { "-C", repoDir, "log", "-1", "--format=%ct", "--no-show-signature", input.getRev()->gitRev() })); Attrs infoAttrs({ {"rev", input.getRev()->gitRev()}, diff --git a/tests/fetchGit.sh b/tests/fetchGit.sh index 88744ee7fe8..ad8d84261ed 100644 --- a/tests/fetchGit.sh +++ b/tests/fetchGit.sh @@ -8,14 +8,19 @@ fi clearStore repo=$TEST_ROOT/git +submodule_upstream=$TEST_ROOT/submodule-upstream export _NIX_FORCE_HTTP=1 rm -rf $repo ${repo}-tmp $TEST_HOME/.cache/nix $TEST_ROOT/worktree $TEST_ROOT/shallow -git init $repo -git -C $repo config user.email "foobar@example.com" -git -C $repo config user.name "Foobar" +init_repo() { + git init $1 + git -C $1 config user.email "foobar@example.com" + git -C $1 config user.name "Foobar" +} + +init_repo $repo echo utrecht > $repo/hello touch $repo/.gitignore @@ -180,12 +185,153 @@ path6=$(nix eval --impure --raw --expr "(builtins.fetchTree { type = \"git\"; ur [[ $path3 = $path6 ]] [[ $(nix eval --impure --expr "(builtins.fetchTree 
{ type = \"git\"; url = \"file://$TEST_ROOT/shallow\"; ref = \"dev\"; shallow = true; }).revCount or 123") == 123 ]] -# Explicit ref = "HEAD" should work, and produce the same outPath as without ref -path7=$(nix eval --impure --raw --expr "(builtins.fetchGit { url = \"file://$repo\"; ref = \"HEAD\"; }).outPath") -path8=$(nix eval --impure --raw --expr "(builtins.fetchGit { url = \"file://$repo\"; }).outPath") +# Adding a git filter does not affect the contents +# +# Background +# ========== +# +# Git filters allow the user to change how files are represented +# in the worktree. +# On checkout, the configured smudge converts blobs to files in the worktree. +# On checkin, the configured clean command converts files back into blobs. +# +# Notable uses include +# - allow the user to work with a platform-specific representation, conveniently +# - git-crypt: only allow some users to see file contents, transparently +# - git-lfs: work with large files without inflating the repository +# +# See also https://git-scm.com/docs/gitattributes#_filter +# +# Why ignore filters +# ================== +# +# To quote the git docs +# +# > the intent is that if someone unsets the filter driver definition, or +# > does not have the appropriate filter program, the project should still +# > be usable. +# +# So the feature was designed to be optional. This confirms that we have a +# choice. Let's look at the individual use cases. +# +# Allow the user to work with a platform-specific representation +# -------------------------------------------------------------- +# +# While this might seem convenient, any such processing can also be done in +# `postUnpack`, so it isn't necessary here. +# Tarballs from GitHub and such don't apply the smudge filter either, so if +# the project is going to be packaged in Nixpkgs, it will have to process its +# files like this anyway. 
+# The real kicker here is that running the smudge filter creates an +# unreproducible dependency, because the filter does not come from a pinned +# immutable source and it could inject information from arbitrary sources. +# +# Git-crypt +# --------- +# +# The nix store can be read by any process on the system, or in some cases, +# when using a cache, literally world-readable. +# Running the filters in fetchGit would essentially make impossible the use of +# git-crypt and Nix flakes in the same repository. +# Even without flakes (or with changes to the flakes feature for that matter), +# the software you want to build generally does not depend on credentials, so +# not decrypting is not only a secure default, but a good one. +# In a rare case where a build does need to decrypt a git-crypted file, one could +# still pass the decrypted file or the git-crypt key explicitly (at the cost of +# exposing it in the store, which is inevitable for nix-built paths). +# +# Git LFS +# ------- +# +# Git LFS was designed to prevent excessive bloat in a repository, so the +# "smudged" versions of these files will be huge. +# +# If we were to include these directly in the `fetchGit` output, this creates +# copies of all the large files for each commit we check out, or even for +# each uncommitted but built local change (with fetchGit ./.). +# +# In many cases, those files may not even be used in the build process. If +# they are required, it seems feasible to fetch them explicitly with a +# fetcher that fetches from LFS based on the sha256 in the unsmudged files. +# It is more fine grained than downloading all LFS files and it does not even +# require IFD because it happens after fetchGit, which runs at evaluation time. +# +# If for some reason LFS support can not be achieved in Nix expressions, we +# should add support for LFS itself, without running any other filters. 
+# +# Conclusion +# ========== +# +# Not running the filters is more reproducible, secure and potentially more +# efficient than running them. +git -C $repo checkout master +cat >>$repo/.git/config < /' + smudge = sed -e 's/^> //' +EOF +cat >$repo/.gitattributes <$repo/einstein.q +git -C $repo add $repo/.gitattributes $repo/einstein.q +git -C $repo commit -m 'Add Einstein quote' +rev4=$(git -C $repo rev-parse HEAD) +git clone --bare $repo $repo.bare +path7=$(nix eval --impure --raw --expr "(builtins.fetchGit { url = $repo.bare; }).outPath") +cmp $path7/einstein.q <(echo "> Insanity is building the same thing over and over and expecting different results.") + +# Files are clean when fetching from a local repo with ref. +path8=$(nix eval --impure --raw --expr "(builtins.fetchGit { url = $repo; rev = \"$rev4\"; }).outPath") [[ $path7 = $path8 ]] +# Files are clean when fetching from a local repo with local changes without ref or rev and submodules = true. +echo "All impurity needs to gain a foothold is for people of good conscience to remain unaware of undeclared inputs." >$repo/jefferson.q +git -C $repo add -N $repo/jefferson.q + +path8=$(nix eval --impure --raw --expr "(builtins.fetchGit { url = $repo; submodules = true; }).outPath") +cmp $path8/jefferson.q <(echo "> All impurity needs to gain a foothold is for people of good conscience to remain unaware of undeclared inputs.") + +# Files are clean when fetching from a local repo with local changes without ref or rev. 
+path9=$(nix eval --impure --raw --expr "(builtins.fetchGit $repo).outPath") +cmp $path9/jefferson.q <(echo "> All impurity needs to gain a foothold is for people of good conscience to remain unaware of undeclared inputs.") + +# Changes in submodules are included +init_repo $submodule_upstream +echo "# Auxiliary quotes" >$submodule_upstream/README.md +git -C $submodule_upstream add $submodule_upstream/README.md +git -C $submodule_upstream commit -m 'First commit' +git -C $repo submodule add $submodule_upstream aux-quotes +echo > $repo/aux-quotes/wisdom "Teach a man to write hello and you'll confuse him for a day. Teach a man to write hello world, and you'll confuse him for a lifetime." +echo > $repo/aux-quotes/actual-einstein "Nix doesn't play dice." +git -C $repo/aux-quotes add -N $repo/aux-quotes/wisdom +git -C $repo/aux-quotes add $repo/aux-quotes/actual-einstein +# git -C $repo diff --submodule=diff +path10=$(nix eval --impure --raw --expr "(builtins.fetchGit { url = $repo; submodules = true; }).outPath") +cmp $path10/aux-quotes/wisdom <(echo "Teach a man to write hello and you'll confuse him for a day. Teach a man to write hello world, and you'll confuse him for a lifetime.") +cmp $path10/aux-quotes/actual-einstein <(echo "Nix doesn't play dice.") + # ref = "HEAD" should fetch the HEAD revision rev4=$(git -C $repo rev-parse HEAD) rev4_nix=$(nix eval --impure --raw --expr "(builtins.fetchGit { url = \"file://$repo\"; ref = \"HEAD\"; }).rev") [[ $rev4 = $rev4_nix ]] + +# Fetching "HEAD" results in the same paths as fetching the rev it points to +path11=$(nix eval --impure --raw --expr "(builtins.fetchGit { url = \"file://$repo\"; ref = \"HEAD\"; }).outPath") +path12=$(nix eval --impure --raw --expr "(builtins.fetchGit { url = \"file://$repo\"; ref = \"$rev4\"; }).outPath") +[[ $path11 = $path12 ]] + +# Explicit ref = "HEAD" should work, and produce the same outPath as without ref, if no changes are present +git -C $repo reset . 
+git -C $repo stash -u +git -C $repo status +path13=$(nix eval --impure --raw --expr "(builtins.fetchGit { url = \"file://$repo\"; ref = \"HEAD\"; }).outPath") +path14=$(nix eval --impure --raw --expr "(builtins.fetchGit { url = \"file://$repo\"; }).outPath") +diff --color=always -U3 $path13 $path14 +[[ $path13 = $path14 ]] + +# Fetching a revision does not include uncommitted changes +[[ $path11 = $path13 ]]