From 0bfd95159cf02ea5b7f0368bd5e53db455f9bc4e Mon Sep 17 00:00:00 2001 From: gabrie30 Date: Sun, 15 Sep 2024 12:28:40 -0700 Subject: [PATCH] Add/ghorg stats (#449) --- CHANGELOG.md | 7 ++ README.md | 28 +++++++- cmd/clone.go | 165 +++++++++++++++++++++++++++++++++++++++++++---- cmd/root.go | 5 ++ sample-conf.yaml | 13 +++- 5 files changed, 203 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a8773ce..eebefec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,11 +6,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) ## [1.9.14] - unreleased ### Added - GHORG_NO_DIR_SIZE flag to turn off directory size output which is now enabled by default +- GHORG_STATS_ENABLED flag to track clone data over time, set to false by default ### Changed ### Deprecated ### Removed ### Fixed ### Security +- Bump code.gitea.io/sdk/gitea from 0.18.0 to 0.19.0 (#441) +- Bump github.com/xanzy/go-gitlab from 0.106.0 to 0.107.0 (#442) +- Bump rexml from 3.2.8 to 3.3.3 in /site (#443) +- Bump rexml from 3.3.3 to 3.3.6 in /site (#444) +- Bump golang.org/x/oauth2 from 0.21.0 to 0.22.0 (#447) +- Bump github.com/xanzy/go-gitlab from 0.107.0 to 0.108.0 (#446) ## [1.9.13] - 7/20/2024 ### Added diff --git a/README.md b/README.md index bf62cd8..6097dfb 100644 --- a/README.md +++ b/README.md @@ -334,6 +334,32 @@ Alternatively, Windows users can also install ghorg using [scoop](https://scoop. scoop install ghorg ``` +## Tracking Clone Data Over Time + +To track data on your clones over time, you can use the ghorg stats feature. It is recommended to enable ghorg stats in your configuration file by setting `GHORG_STATS_ENABLED=true`. This ensures that each clone operation is logged automatically without needing to set the command line flag `--stats-enabled` every time. 
**The ghorg stats feature is disabled by default and needs to be enabled.** + +When ghorg stats is enabled, the CSV file `_ghorg_stats.csv` is created in the directory specified by `GHORG_ABSOLUTE_PATH_TO_CLONE_TO`. This file contains detailed information about each clone operation, which is useful for auditing and tracking purposes such as the size of the clone and the number of new commits over time. + +Below are the headers and their descriptions. Note that these headers may change over time. If there are any changes in the headers, a new file named `_ghorg_stats_new_header_${sha256HashOfHeader}.csv` will be created to prevent incorrect data from being added to your CSV. + +- **datetime**: Date and time of the clone in YYYY-MM-DD hh:mm:ss format +- **clonePath**: Location of the clone directory +- **scm**: Name of the source control used +- **cloneType**: Either user or org clone +- **cloneTarget**: What is specified after the clone command `ghorg clone <target>` +- **totalCount**: Total number of resources expected to be cloned or pulled +- **newClonesCount**: Sum of all new repos cloned +- **existingResourcesPulledCount**: Sum of all repos that were pulled +- **dirSizeInMB**: The size in megabytes of the output dir +- **newCommits**: Sum of all new commits in all repos pulled or cloned +- **cloneInfosCount**: Number of clone Info messages +- **cloneErrorsCount**: Number of clone Issues/Errors +- **updateRemoteCount**: Number of remotes updated +- **pruneCount**: Number of repos pruned +- **hasCollisions**: Whether there were any name collisions; this can only happen with gitlab clones +- **ghorgignore**: Whether a ghorgignore was used in the clone +- **ghorgVersion**: Version of ghorg used in the clone + ## Troubleshooting - If you are having trouble cloning repos. Try to clone one of the repos locally e.g. manually running `git clone https://github.com/your_private_org/your_private_repo.git` if this does not work, ghorg will also not work. 
Your git client must first be setup to clone the target repos. If you normally clone using an ssh key use the `--protocol=ssh` flag with ghorg. This will fetch the ssh clone urls instead of the https clone urls. @@ -342,5 +368,5 @@ Alternatively, Windows users can also install ghorg using [scoop](https://scoop. - If your GitHub Personal Access Token is only finding public repos, give your token all the repos permissions - Make sure your `$ git --version` is >= 2.19.0 - Check for other software, such as anti-malware, that could interfere with ghorgs ability to create large number of connections, see [issue 132](https://github.com/gabrie30/ghorg/issues/132#issuecomment-889357960). You can also lower the concurrency with `--concurrency=n` default is 25. -- To debug yourself you can call ghorg with the GHORG_DEBUG=true env e.g `GHORG_DEBUG=true ghorg clone kubernetes`. Note, when this env is set concurrency is set to a value of 1 +- To debug yourself you can call ghorg with the GHORG_DEBUG=true env e.g `GHORG_DEBUG=true ghorg clone kubernetes`. Note, when this env is set concurrency is set to a value of 1 and the API key used will be printed to stdout. 
- If you've gotten this far and still have an issue feel free to raise an issue diff --git a/cmd/clone.go b/cmd/clone.go index b2def4b..d2ba94e 100644 --- a/cmd/clone.go +++ b/cmd/clone.go @@ -3,6 +3,7 @@ package cmd import ( "bufio" + "crypto/sha256" "fmt" "log" "net/url" @@ -12,6 +13,7 @@ import ( "strconv" "strings" "sync" + "time" "github.com/gabrie30/ghorg/colorlog" "github.com/gabrie30/ghorg/configs" @@ -37,6 +39,9 @@ Or see examples directory at https://github.com/gabrie30/ghorg/tree/master/examp Run: cloneFunc, } +var cachedDirSizeMB float64 +var isDirSizeCached bool + func cloneFunc(cmd *cobra.Command, argz []string) { if cmd.Flags().Changed("path") { absolutePath := configs.EnsureTrailingSlashOnFilePath((cmd.Flag("path").Value.String())) @@ -157,6 +162,10 @@ func cloneFunc(cmd *cobra.Command, argz []string) { os.Setenv("GHORG_SKIP_ARCHIVED", "true") } + if cmd.Flags().Changed("stats-enabled") { + os.Setenv("GHORG_STATS_ENABLED", "true") + } + if cmd.Flags().Changed("no-clean") { os.Setenv("GHORG_NO_CLEAN", "true") } @@ -914,9 +923,13 @@ func CloneAllRepos(git git.Gitter, cloneTargets []scm.Repo) { } } + var pruneCount int + cloneInfosCount := len(cloneInfos) + cloneErrorsCount := len(cloneErrors) + allReposToCloneCount := len(cloneTargets) // Now, clean up local repos that don't exist in remote, if prune flag is set if os.Getenv("GHORG_PRUNE") == "true" { - pruneRepos(cloneTargets) + pruneCount = pruneRepos(cloneTargets) } if os.Getenv("GHORG_QUIET") != "true" { @@ -927,7 +940,13 @@ func CloneAllRepos(git git.Gitter, cloneTargets []scm.Repo) { } } - if os.Getenv("GHORG_EXIT_CODE_ON_CLONE_INFOS") != "0" && len(cloneInfos) > 0 { + // This needs to be called after printFinishedWithDirSize() + if os.Getenv("GHORG_STATS_ENABLED") == "true" { + date := time.Now().Format("2006-01-02 15:04:05") + writeGhorgStats(date, allReposToCloneCount, cloneCount, pulledCount, cloneInfosCount, cloneErrorsCount, updateRemoteCount, newCommits, pruneCount, hasCollisions) + } + + 
if os.Getenv("GHORG_EXIT_CODE_ON_CLONE_INFOS") != "0" && cloneInfosCount > 0 { exitCode, err := strconv.Atoi(os.Getenv("GHORG_EXIT_CODE_ON_CLONE_INFOS")) if err != nil { colorlog.PrintError("Could not convert GHORG_EXIT_CODE_ON_CLONE_INFOS from string to integer") @@ -937,7 +956,7 @@ func CloneAllRepos(git git.Gitter, cloneTargets []scm.Repo) { os.Exit(exitCode) } - if len(cloneErrors) > 0 { + if cloneErrorsCount > 0 { exitCode, err := strconv.Atoi(os.Getenv("GHORG_EXIT_CODE_ON_CLONE_ISSUES")) if err != nil { colorlog.PrintError("Could not convert GHORG_EXIT_CODE_ON_CLONE_ISSUES from string to integer") @@ -949,21 +968,136 @@ func CloneAllRepos(git git.Gitter, cloneTargets []scm.Repo) { } +func writeGhorgStats(date string, allReposToCloneCount, cloneCount, pulledCount, cloneInfosCount, cloneErrorsCount, updateRemoteCount, newCommits, pruneCount int, hasCollisions bool) error { + statsFilePath := filepath.Join(os.Getenv("GHORG_ABSOLUTE_PATH_TO_CLONE_TO"), "_ghorg_stats.csv") + + fileExists := true + + if _, err := os.Stat(statsFilePath); os.IsNotExist(err) { + fileExists = false + } + + header := "datetime,clonePath,scm,cloneType,cloneTarget,totalCount,newClonesCount,existingResourcesPulledCount,dirSizeInMB,newCommits,cloneInfosCount,cloneErrorsCount,updateRemoteCount,pruneCount,hasCollisions,ghorgignore,ghorgVersion\n" + + var file *os.File + var err error + + if fileExists { + // Read the existing header + existingHeader, err := readFirstLine(statsFilePath) + if err != nil { + colorlog.PrintError(fmt.Sprintf("Error reading header from stats file: %v", err)) + return err + } + + // Check if the existing header is different from the new header, need to add a newline + if existingHeader+"\n" != header { + hashedHeader := fmt.Sprintf("%x", sha256.Sum256([]byte(header))) + newHeaderFilePath := filepath.Join(os.Getenv("GHORG_ABSOLUTE_PATH_TO_CLONE_TO"), fmt.Sprintf("ghorg_stats_new_header_%s.csv", hashedHeader)) + // Create a new file with the new header + file, err = 
os.OpenFile(newHeaderFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + colorlog.PrintError(fmt.Sprintf("Error creating new header stats file: %v", err)) + return err + } + if _, err := file.WriteString(header); err != nil { + colorlog.PrintError(fmt.Sprintf("Error writing new header to GHORG_STATS file: %v", err)) + return err + } + } else { + // Open the existing file in append mode + file, err = os.OpenFile(statsFilePath, os.O_APPEND|os.O_WRONLY, 0644) + if err != nil { + colorlog.PrintError(fmt.Sprintf("Error opening stats file for appending: %v", err)) + return err + } + } + } else { + // Create the file and write the header + file, err = os.OpenFile(statsFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + colorlog.PrintError(fmt.Sprintf("Error creating stats file: %v", err)) + return err + } + if _, err := file.WriteString(header); err != nil { + colorlog.PrintError(fmt.Sprintf("Error writing header to GHORG_STATS file: %v", err)) + return err + } + } + defer file.Close() + + data := fmt.Sprintf("%v,%v,%v,%v,%v,%v,%v,%v,%.2f,%v,%v,%v,%v,%v,%v,%v,%v\n", + date, + outputDirAbsolutePath, + os.Getenv("GHORG_SCM_TYPE"), + os.Getenv("GHORG_CLONE_TYPE"), + targetCloneSource, + allReposToCloneCount, + cloneCount, + pulledCount, + cachedDirSizeMB, + newCommits, + cloneInfosCount, + cloneErrorsCount, + updateRemoteCount, + pruneCount, + hasCollisions, + configs.GhorgIgnoreDetected(), + GetVersion()) + if _, err := file.WriteString(data); err != nil { + colorlog.PrintError(fmt.Sprintf("Error writing data to GHORG_STATS file: %v", err)) + return err + } + + return nil +} + +func readFirstLine(filePath string) (string, error) { + file, err := os.Open(filePath) + if err != nil { + return "", err + } + defer file.Close() + + scanner := bufio.NewScanner(file) + if scanner.Scan() { + return scanner.Text(), nil + } + if err := scanner.Err(); err != nil { + return "", err + } + + return "", nil +} + func printFinishedWithDirSize() { - 
dirSizeMB, err := calculateDirSizeInMb(outputDirAbsolutePath) + dirSizeMB, err := getCachedOrCalculatedOutputDirSizeInMb() if err != nil { if os.Getenv("GHORG_DEBUG") == "true" { colorlog.PrintError(fmt.Sprintf("Error calculating directory size: %v", err)) } colorlog.PrintSuccess(fmt.Sprintf("\nFinished! %s", outputDirAbsolutePath)) - } else { - if dirSizeMB > 1000 { - dirSizeGB := dirSizeMB / 1000 - colorlog.PrintSuccess(fmt.Sprintf("\nFinished! %s (Size: %.2f GB)", outputDirAbsolutePath, dirSizeGB)) - } else { - colorlog.PrintSuccess(fmt.Sprintf("\nFinished! %s (Size: %.2f MB)", outputDirAbsolutePath, dirSizeMB)) - } + return } + + if dirSizeMB > 1000 { + dirSizeGB := dirSizeMB / 1000 + colorlog.PrintSuccess(fmt.Sprintf("\nFinished! %s (Size: %.2f GB)", outputDirAbsolutePath, dirSizeGB)) + } else { + colorlog.PrintSuccess(fmt.Sprintf("\nFinished! %s (Size: %.2f MB)", outputDirAbsolutePath, dirSizeMB)) + } +} + +func getCachedOrCalculatedOutputDirSizeInMb() (float64, error) { + if !isDirSizeCached { + dirSizeMB, err := calculateDirSizeInMb(outputDirAbsolutePath) + if err != nil { + return 0, err + } + cachedDirSizeMB = dirSizeMB + isDirSizeCached = true + } + return cachedDirSizeMB, nil } func calculateDirSizeInMb(path string) (float64, error) { @@ -1056,7 +1190,8 @@ func filterByTargetReposPath(cloneTargets []scm.Repo) []scm.Repo { return cloneTargets } -func pruneRepos(cloneTargets []scm.Repo) { +func pruneRepos(cloneTargets []scm.Repo) int { + count := 0 colorlog.PrintInfo("\nScanning for local clones that have been removed on remote...") files, err := os.ReadDir(outputDirAbsolutePath) @@ -1080,6 +1215,7 @@ func pruneRepos(cloneTargets []scm.Repo) { colorlog.PrintSubtleInfo( fmt.Sprintf("Deleting %s", filepath.Join(outputDirAbsolutePath, f.Name()))) err = os.RemoveAll(filepath.Join(outputDirAbsolutePath, f.Name())) + count++ if err != nil { log.Fatal(err) } @@ -1088,6 +1224,8 @@ func pruneRepos(cloneTargets []scm.Repo) { } } } + + return count } func 
printCloneStatsMessage(cloneCount, pulledCount, updateRemoteCount, newCommits int) { @@ -1245,6 +1383,9 @@ func PrintConfigs() { } colorlog.PrintInfo("* Config Used : " + os.Getenv("GHORG_CONFIG")) + if os.Getenv("GHORG_STATS_ENABLED") == "true" { + colorlog.PrintInfo("* Stats Enabled : " + os.Getenv("GHORG_STATS_ENABLED")) + } colorlog.PrintInfo("* Ghorg version : " + GetVersion()) colorlog.PrintInfo("*************************************") diff --git a/cmd/root.go b/cmd/root.go index c60bd0d..d892673 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -67,6 +67,7 @@ var ( noToken bool quietMode bool noDirSize bool + ghorgStatsEnabled bool args []string cloneErrors []string cloneInfos []string @@ -162,6 +163,8 @@ func getOrSetDefaults(envVar string) { os.Setenv(envVar, "25") case "GHORG_QUIET": os.Setenv(envVar, "false") + case "GHORG_STATS_ENABLED": + os.Setenv(envVar, "false") case "GHORG_EXIT_CODE_ON_CLONE_INFOS": os.Setenv(envVar, "0") case "GHORG_EXIT_CODE_ON_CLONE_ISSUES": @@ -249,6 +252,7 @@ func InitConfig() { getOrSetDefaults("GHORG_INCLUDE_SUBMODULES") getOrSetDefaults("GHORG_EXIT_CODE_ON_CLONE_INFOS") getOrSetDefaults("GHORG_EXIT_CODE_ON_CLONE_ISSUES") + getOrSetDefaults("GHORG_STATS_ENABLED") // Optionally set getOrSetDefaults("GHORG_TARGET_REPOS_PATH") getOrSetDefaults("GHORG_CLONE_DEPTH") @@ -321,6 +325,7 @@ func init() { cloneCmd.Flags().BoolVar(&backup, "backup", false, "GHORG_BACKUP - Backup mode, clone as mirror, no working copy (ignores branch parameter)") cloneCmd.Flags().BoolVar(&quietMode, "quiet", false, "GHORG_QUIET - Emit critical output only") cloneCmd.Flags().BoolVar(&includeSubmodules, "include-submodules", false, "GHORG_INCLUDE_SUBMODULES - Include submodules in all clone and pull operations.") + cloneCmd.Flags().BoolVar(&ghorgStatsEnabled, "stats-enabled", false, "GHORG_STATS_ENABLED - Creates a CSV in the GHORG_ABSOLUTE_PATH_TO_CLONE_TO called _ghorg_stats.csv with info about each clone. 
This allows you to track clone data over time such as number of commits and size in megabytes of the clone directory.") cloneCmd.Flags().StringVarP(&baseURL, "base-url", "", "", "GHORG_SCM_BASE_URL - Change SCM base url, for on self hosted instances (currently gitlab, gitea and github (use format of https://git.mydomain.com/api/v3))") cloneCmd.Flags().StringVarP(&concurrency, "concurrency", "", "", "GHORG_CONCURRENCY - Max goroutines to spin up while cloning (default 25)") cloneCmd.Flags().StringVarP(&cloneDepth, "clone-depth", "", "", "GHORG_CLONE_DEPTH - Create a shallow clone with a history truncated to the specified number of commits") diff --git a/sample-conf.yaml b/sample-conf.yaml index 0beb4f3..c0be332 100644 --- a/sample-conf.yaml +++ b/sample-conf.yaml @@ -57,7 +57,7 @@ GHORG_PRUNE: false GHORG_PRUNE_NO_CONFIRM: false # Color output (enabled, disabled) -# flag (--color) eg: --color=disabled +# flag (--color) eg: --color=enabled eg: --color=disabled GHORG_COLOR: disabled # Skip archived repos, currently github/gitlab/gitea only @@ -73,7 +73,7 @@ GHORG_SKIP_FORKS: false GHORG_BACKUP: false # Max goroutines created while cloning -# flag (--concurrency) +# flag (--concurrency) eg: --concurrency=1 GHORG_CONCURRENCY: 25 # Create a shallow clone with a history truncated to the specified number of commits @@ -153,9 +153,17 @@ GHORG_NO_TOKEN: false # Skips the calculation of the output directory size at the end of a clone operation. # This can save time, especially when cloning a large number of repositories. +# The directory size calculation is enabled by default # flag (--no-dir-size) GHORG_NO_DIR_SIZE: false +# Creates a CSV in the GHORG_ABSOLUTE_PATH_TO_CLONE_TO called _ghorg_stats.csv with info about each clone +# This allows you to track clone data over time such as number of commits and size in megabytes of the clone directory. 
+# If the header of the CSV changes, it will create a new file named _ghorg_stats_new_header_${sha256HashOfHeader}.csv. This is how breaking changes to the CSV are handled over time +# More information at https://github.com/gabrie30/ghorg?tab=readme-ov-file#tracking-clone-data-over-time +# flag (--stats-enabled) +GHORG_STATS_ENABLED: false + # Specifies the location of your ghorg conf.yaml, allowing you to have many configuration files, or none at all # default: ghorg looks in $HOME/.config/ghorg/conf.yaml, if not set in that location nor as a commandline flag, ghorg will use all default values # NOTE: this cannot be set in the configuration file. Its supported through CLI flag and ENV var only. @@ -166,6 +174,7 @@ GHORG_NO_DIR_SIZE: false # NOTE: This setting cannot be configured through the configuration file or the CLI. It can only be set as an environment variable. # For example: GHORG_DEBUG=true ghorg clone kubernetes # When using this env concurrency is set to a value of 1, this behavior can be overwritten for debugging concurrency issues by setting GHORG_CONCURRENCY_DEBUG=true in addition to setting GHORG_DEBUG=true +# Note: when this is enabled, the API key used will be printed to stdout # GHORG_DEBUG: # +-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+