Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unsmudge fetchGit for reproducibility and security #4635

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions src/libexpr/primops/fetchTree.cc
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,23 @@ static RegisterPrimOp primop_fetchGit({
- url\
The URL of the repo.

The URL can be a path value, in order to fetch from a local
repository. When `ref` and `rev` unset, the default behavior is to
produce an output that includes the uncommitted changes to files known
to git. This is similar to `HEAD` after running `git add -u` and
`git commit`. Files that have not been added to the index or `HEAD`
will be ignored.

This will not run the git smudge filters, but changed files will be
processed with the git clean filter, in order to be consistent with
remote repositories. This way, decrypted git-crypt secrets are not
added to the store. Warning: the current implementation does not
apply this logic to submodules.

Tip: if you prefer to keep your index clean, you can use `git add -N`
to add files to the index without adding their contents. You can add
the contents later with `git add -p` or `git add`.

- name\
The name of the directory the repo should be exported to in the
store. Defaults to the basename of the URL.
Expand All @@ -315,11 +332,13 @@ static RegisterPrimOp primop_fetchGit({

- ref\
The git ref to look for the requested revision under. This is
often a branch or tag name. Defaults to `HEAD`.
often a branch or tag name. For remote repositories, this defaults
to `HEAD`. If the `url` is a path value, some local changes are
included; see `url`.

By default, the `ref` value is prefixed with `refs/heads/`. As
of Nix 2.3.0 Nix will not prefix `refs/heads/` if `ref` starts
with `refs/`.
with `refs/`. `HEAD` is also not prefixed.

- submodules\
A Boolean parameter that specifies whether submodules should be
Expand Down
209 changes: 152 additions & 57 deletions src/libfetchers/git.cc
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,16 @@ struct GitInputScheme : InputScheme
if (submodules) cacheType += "-submodules";
if (allRefs) cacheType += "-all-refs";

bool isDirtyTree = false;

// TODO: Recursively use the `write-tree` logic for the submodules.
// For now, we use a diff to get the uncommitted submodule
// changes. This only works correctly in cases where the diff
// does not depend on smudge/clean behavior, which we can't
// assume. The submodule worktree does come from a fresh repo,
// so at least it seems that git-crypt security is not at risk.
std::string dirtySubmoduleDiff;

auto getImmutableAttrs = [&]()
{
return Attrs({
Expand All @@ -197,6 +207,13 @@ struct GitInputScheme : InputScheme
if (!shallow)
input.attrs.insert_or_assign("revCount", getIntAttr(infoAttrs, "revCount"));
input.attrs.insert_or_assign("lastModified", getIntAttr(infoAttrs, "lastModified"));

// If the tree is dirty, we use a tree hash internally, but we don't
// want to expose it.
if (isDirtyTree) {
input.attrs.insert_or_assign("rev", "0000000000000000000000000000000000000000");
}

return {
Tree(store->toRealPath(storePath), std::move(storePath)),
input
Expand All @@ -211,6 +228,8 @@ struct GitInputScheme : InputScheme
auto [isLocal, actualUrl_] = getActualUrl(input);
auto actualUrl = actualUrl_; // work around clang bug

bool haveHEAD = true;

// If this is a local directory and no ref or revision is
// given, then allow the use of an unclean working tree.
if (!input.getRef() && !input.getRev() && isLocal) {
Expand All @@ -227,10 +246,10 @@ struct GitInputScheme : InputScheme
if (commonGitDir != ".git")
gitDir = commonGitDir;

bool haveCommits = !readDirectory(gitDir + "/refs/heads").empty();
haveHEAD = !readDirectory(gitDir + "/refs/heads").empty();

try {
if (haveCommits) {
if (haveHEAD) {
runProgram("git", true, { "-C", actualUrl, "diff-index", "--quiet", "HEAD", "--" });
clean = true;
}
Expand All @@ -240,52 +259,75 @@ struct GitInputScheme : InputScheme

if (!clean) {

/* This is an unclean working tree. So copy all tracked files. */
/* This is an unclean working tree. We can't use the worktree
files, because those may be smudged. */

if (!settings.allowDirty)
throw Error("Git tree '%s' is dirty", actualUrl);

if (settings.warnDirty)
warn("Git tree '%s' is dirty", actualUrl);

auto gitOpts = Strings({ "-C", actualUrl, "ls-files", "-z" });
if (submodules)
gitOpts.emplace_back("--recurse-submodules");

auto files = tokenizeString<std::set<std::string>>(
runProgram("git", true, gitOpts), "\0"s);

PathFilter filter = [&](const Path & p) -> bool {
assert(hasPrefix(p, actualUrl));
std::string file(p, actualUrl.size() + 1);

auto st = lstat(p);

if (S_ISDIR(st.st_mode)) {
auto prefix = file + "/";
auto i = files.lower_bound(prefix);
return i != files.end() && hasPrefix(*i, prefix);
}

return files.count(file);
};
isDirtyTree = true;

// We can't use an existing file for the temporary git index,
// so we need to use a tmpdir instead of a tmpfile.
// Non-submodule changes are captured by the tree we build using
// this temporary index.
Path tmpIndexDir = createTempDir();
AutoDelete delTmpIndexDir(tmpIndexDir, true);
Path tmpIndex = tmpIndexDir + "/tmp-git-index";

std::set<Path> files = tokenizeString<std::set<std::string>>(
runProgram("git", true, { "-C", actualUrl, "ls-files", "-z" }),
"\0"s);

{
RunOptions gitOptions("git", { "-C", actualUrl, "add", "--no-warn-embedded-repo", "--" });
auto env = getEnv();
env["GIT_INDEX_FILE"] = tmpIndex;
gitOptions.environment = env;
for (auto file : files)
gitOptions.args.push_back(file);

auto result = runProgram(gitOptions);
if (result.first)
throw ExecError(result.first, fmt("program git add -u %1%", statusToString(result.first)));
}
std::string tree;
{
RunOptions gitOptions("git", { "-C", actualUrl, "write-tree" });
auto env = getEnv();
env["GIT_INDEX_FILE"] = tmpIndex;
gitOptions.environment = env;

auto result = runProgram(gitOptions);
if (result.first)
throw ExecError(result.first, fmt("program git write-tree %1%", statusToString(result.first)));
tree = trim(result.second);

// Note [tree as rev]
// We set `rev` to a tree object, even if it's normally a
// commit object. This way, we get some use out of the
// cache, to avoid copying files unnecessarily.
input.attrs.insert_or_assign("rev", trim(result.second));
}

auto storePath = store->addToStore("source", actualUrl, FileIngestionMethod::Recursive, htSHA256, filter);
// Use a diff to gather submodule changes as well. See `dirtySubmoduleDiff`
if (submodules) {
RunOptions gitOptions("git", { "-C", actualUrl, "diff", tree, "--submodule=diff" });

// FIXME: maybe we should use the timestamp of the last
// modified dirty file?
input.attrs.insert_or_assign(
"lastModified",
haveCommits ? std::stoull(runProgram("git", true, { "-C", actualUrl, "log", "-1", "--format=%ct", "--no-show-signature", "HEAD" })) : 0);
auto result = runProgram(gitOptions);
if (result.first)
throw ExecError(result.first, fmt("program git diff %1%", statusToString(result.first)));

return {
Tree(store->toRealPath(storePath), std::move(storePath)),
input
};
dirtySubmoduleDiff = result.second;
}
}
}

if (!input.getRef()) input.attrs.insert_or_assign("ref", isLocal ? readHead(actualUrl) : "master");
if (!input.getRef() && haveHEAD)
input.attrs.insert_or_assign("ref", isLocal ? readHead(actualUrl) : "master");

Attrs mutableAttrs({
{"type", cacheType},
Expand Down Expand Up @@ -406,49 +448,96 @@ struct GitInputScheme : InputScheme
AutoDelete delTmpDir(tmpDir, true);
PathFilter filter = defaultPathFilter;

RunOptions checkCommitOpts(
"git",
{ "-C", repoDir, "cat-file", "commit", input.getRev()->gitRev() }
);
checkCommitOpts.searchPath = true;
checkCommitOpts.mergeStderrToStdout = true;

auto result = runProgram(checkCommitOpts);
if (WEXITSTATUS(result.first) == 128
&& result.second.find("bad file") != std::string::npos
) {
throw Error(
"Cannot find Git revision '%s' in ref '%s' of repository '%s'! "
"Please make sure that the " ANSI_BOLD "rev" ANSI_NORMAL " exists on the "
ANSI_BOLD "ref" ANSI_NORMAL " you've specified or add " ANSI_BOLD
"allRefs = true;" ANSI_NORMAL " to " ANSI_BOLD "fetchGit" ANSI_NORMAL ".",
input.getRev()->gitRev(),
*input.getRef(),
actualUrl
// Skip check if rev is set to a tree object. See Note [tree as rev]
if (!isDirtyTree) {
RunOptions checkCommitOpts(
"git",
{ "-C", repoDir, "cat-file", "commit", input.getRev()->gitRev() }
);
checkCommitOpts.searchPath = true;
checkCommitOpts.mergeStderrToStdout = true;

auto result = runProgram(checkCommitOpts);
if (WEXITSTATUS(result.first) == 128
&& result.second.find("bad file") != std::string::npos
) {
throw Error(
"Cannot find Git revision '%s' in ref '%s' of repository '%s'! "
"Please make sure that the " ANSI_BOLD "rev" ANSI_NORMAL " exists on the "
ANSI_BOLD "ref" ANSI_NORMAL " you've specified or add " ANSI_BOLD
"allRefs = true;" ANSI_NORMAL " to " ANSI_BOLD "fetchGit" ANSI_NORMAL ".",
input.getRev()->gitRev(),
*input.getRef(),
actualUrl
);
}
}

if (submodules) {
Path tmpGitDir = createTempDir();
AutoDelete delTmpGitDir(tmpGitDir, true);

// For this checkout approach, we need a commit, not just a treeish.
if (isDirtyTree) {
RunOptions gitOptions("git", { "-C", actualUrl, "commit-tree", "-m", "temporary commit for dirty tree", input.getRev()->gitRev() });
auto result = runProgram(gitOptions);
if (result.first)
throw ExecError(result.first, fmt("program git commit-tree %1%", statusToString(result.first)));
input.attrs.insert_or_assign("rev", trim(result.second));
}

runProgram("git", true, { "init", tmpDir, "--separate-git-dir", tmpGitDir });
// TODO: repoDir might lack the ref (it only checks if rev
// exists, see FIXME above) so use a big hammer and fetch
// everything to ensure we get the rev.
runProgram("git", true, { "-C", tmpDir, "fetch", "--quiet", "--force",
"--update-head-ok", "--", repoDir, "refs/*:refs/*" });
"--update-head-ok", "--", repoDir, "refs/*:refs/*",
input.getRev()->gitRev() });

runProgram("git", true, { "-C", tmpDir, "checkout", "--quiet", input.getRev()->gitRev() });
runProgram("git", true, { "-C", tmpDir, "remote", "add", "origin", actualUrl });
runProgram("git", true, { "-C", tmpDir, "submodule", "--quiet", "update", "--init", "--recursive" });

if (dirtySubmoduleDiff.size()) {
RunOptions gitOptions("git", { "-C", tmpDir, "apply" });
StringSource s(dirtySubmoduleDiff);
gitOptions.standardIn = &s;
auto result = runProgram(gitOptions);
if (result.first)
throw ExecError(result.first, fmt("program git apply %1%", statusToString(result.first)));
}

filter = isNotDotGitDirectory;
} else {
Strings noSmudgeOptions;
{
RunOptions gitOptions("git", { "-C", repoDir, "config", "-l" });
auto result = runProgram(gitOptions);
auto ss = std::stringstream{result.second};
StringSet filters;

for (std::string line; std::getline(ss, line, '\n');) {
Copy link
Contributor

@greedy greedy Sep 30, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

git config values can have multiple lines (try git -c $'foo=two\nlines' config -l to see for yourself), so splitting lines here can cause incorrect results. I suggest to use git config --name-only --get-regexp '^filter\..*\.(smudge|process)$' to avoid this issue (config keys cannot contain newlines so splitting on newlines is now acceptable) and, as a bonus, only return relevant names.

Additionally, it isn't necessary to set the smudge filter to cat. If the filter is set to an empty string then no filtering is done.

With these two changes, noSmudgeOptions can be constructed by just appending = to the names returned from the git config invocation.

std::string prefix = "filter.";
std::string infix = ".smudge=";
auto infixPos = line.find(infix);
if (hasPrefix(line, prefix) && infixPos != std::string::npos) {
filters.emplace(line.substr(prefix.size(), infixPos - prefix.size()));
}
}
for (auto filter : filters) {
noSmudgeOptions.emplace_back("-c");
noSmudgeOptions.emplace_back("filter." + filter + ".smudge=cat --");
}
}

// FIXME: should pipe this, or find some better way to extract a
// revision.
auto source = sinkToSource([&](Sink & sink) {
RunOptions gitOptions("git", { "-C", repoDir, "archive", input.getRev()->gitRev() });
RunOptions gitOptions("git", noSmudgeOptions);
gitOptions.args.push_back("-C");
gitOptions.args.push_back(repoDir);
gitOptions.args.push_back("archive");
gitOptions.args.push_back(input.getRev()->gitRev());
gitOptions.standardOut = &sink;
runProgram2(gitOptions);
});
Expand All @@ -458,7 +547,13 @@ struct GitInputScheme : InputScheme

auto storePath = store->addToStore(name, tmpDir, FileIngestionMethod::Recursive, htSHA256, filter);

auto lastModified = std::stoull(runProgram("git", true, { "-C", repoDir, "log", "-1", "--format=%ct", "--no-show-signature", input.getRev()->gitRev() }));
const auto now = std::chrono::system_clock::now();

// FIXME: when isDirtyTree, maybe we should use the timestamp
// of the last modified dirty file?
auto lastModified = isDirtyTree ?
(std::chrono::duration_cast<std::chrono::seconds>(now.time_since_epoch()).count())
: std::stoull(runProgram("git", true, { "-C", repoDir, "log", "-1", "--format=%ct", "--no-show-signature", input.getRev()->gitRev() }));

Attrs infoAttrs({
{"rev", input.getRev()->gitRev()},
Expand Down
Loading