diff --git a/README.md b/README.md index 053bfff..da9da09 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # GitHub Reporting Tool 👨‍💻 📊 🐍 + > Create detailed and summary CSV reports of activity by a GitHub user, using the GraphQL API [![GitHub tag](https://img.shields.io/github/tag/MichaelCurrin/github-reporting-py)](https://github.com/MichaelCurrin/github-reporting-py/tags/) @@ -7,6 +8,20 @@ [![Made with Python](https://img.shields.io/badge/Python->=3.9-blue?logo=python&logoColor=white)](https://python.org) [![API - GitHub GraphQL](https://img.shields.io/badge/GitHub_API-V4_GraphQL-blue?logo=github)](https://graphql.github.io/) +## Quick start + +Say you just want to get a CSV file of all your commits on the default branch of a repo. + +1. `cd ghgql` +2. Set your GitHub access token in `etc/app.local.yml` +3. Run `python config.py` +4. Run your commit query via + +```bash +python repo_commits.py REPO_OWNER REPO_NAME COMMITTER -o OUTPUT_DIR -s START_DATE -e END_DATE +``` + +By default, this app queries commits from the default branch of a repo; pass the `-b` flag to target another branch. ## Purpose

This tool was created to: - Act as a **wrapper** on requests and formatting, so you can focus on writing or using a query and getting the data out as a CSV. - Act an easy CLI for anyone - without caring about what language the tool is implemented in (other than installing initially). - ## Documentation
@@ -26,12 +40,10 @@ This tool was created to:
- ## Contributing If you want to make the project better, see the [contribution guidelines](/CONTRIBUTING.md). - ## License Released under [MIT](/LICENSE) by [@MichaelCurrin](https://github.com/MichaelCurrin/). diff --git a/ghgql/demo/variables.py b/ghgql/demo/variables.py index 9fbc580..a74ce72 100644 --- a/ghgql/demo/variables.py +++ b/ghgql/demo/variables.py @@ -64,7 +64,7 @@ headers = {"Authorization": f"token {config.ACCESS_TOKEN}"} # Send the POST request. -resp = requests.post(config.BASE_URL, json=payload, headers=headers) +resp = requests.post(config.BASE_URL, json=payload, headers=headers, timeout=10) # Pretty print the output. prettified = json.dumps(resp.json(), indent=4) diff --git a/ghgql/lib/__init__.py b/ghgql/lib/__init__.py index 602d692..f4b1baf 100644 --- a/ghgql/lib/__init__.py +++ b/ghgql/lib/__init__.py @@ -30,12 +30,16 @@ HEADERS = {"Authorization": f"token {config.ACCESS_TOKEN}"} MAX_ATTEMPTS = 3 -dict_of_str = dict[str, str] -list_of_str = list[str] +DictOfStr = dict[str, str] +ListOfStr = list[str] -def _request(url: str, payload: dict_of_str, headers: dict_of_str): - resp = requests.post(url, json=payload, headers=headers) +def _request(url: str, payload: DictOfStr, headers: DictOfStr): + resp = requests.post( + url, + json=payload, + headers=headers, + timeout=10) resp_json = resp.json() resp_msg = resp_json.get("message", None) @@ -66,7 +70,7 @@ def _request(url: str, payload: dict_of_str, headers: dict_of_str): return resp_json -def fetch_github_data(query: str, variables=None) -> dict_of_str: +def fetch_github_data(query: str, variables={}) -> DictOfStr: """ Get data from GitHub API using given parameters. @@ -74,9 +78,6 @@ def fetch_github_data(query: str, variables=None) -> dict_of_str: might still contain some data. A 404 will not contain the data or errors keys. 
""" - if not variables: - variables = {} - payload = { "query": query, "variables": variables, @@ -99,9 +100,9 @@ def fetch_github_data(query: str, variables=None) -> dict_of_str: # TODO: Sleep for set time or perhaps short time if too frequent # between requests. - seconds = 10 + seconds = 1 text.eprint(f"Sleeping {seconds} s...") - sleep(seconds * 1000) + sleep(seconds) text.eprint("Retrying...") else: break @@ -113,7 +114,7 @@ def read_file(path: Path): """ TODO: Refactor to use Path().read() instead. """ - with open(path) as f_in: + with open(path, encoding='utf8') as f_in: file_text = f_in.read() return file_text @@ -131,7 +132,7 @@ def write_file(content, path: Path): print("Writing") print(f" - path: {path}") - with open(path, "w") as f_out: + with open(path, "w", encoding='utf8') as f_out: f_out.writelines(content) @@ -144,24 +145,24 @@ def read_template(path: Path): # TODO Rename to path. # TODO Refactor so the file only has to be read once for a set of paged queries. -def query_by_filename(path: Path, variables=None): - if not variables: - variables = {} - +def query_by_filename(path: Path, variables={}): + """ + Use query file `path` and `variables` to make a query. + """ query = read_file(path) - resp = fetch_github_data(query, variables) - return resp + return fetch_github_data(query, variables) def read_csv(path: Path): - with open(path) as f_in: - reader = csv.DictReader(f_in) - - return list(reader) + """ + Read a CSV file. + """ + with open(path, "r", encoding='utf8') as f_in: + return list(csv.DictReader(f_in)) -def write_csv(path: Path, rows: list[dict_of_str], append=False) -> None: +def write_csv(path: Path, rows: list[DictOfStr], append=False) -> None: """ Write a CSV file to a path with given rows and header from first row. 
@@ -178,10 +179,9 @@ def write_csv(path: Path, rows: list[dict_of_str], append=False) -> None: is_new_file = not path.exists() mode = "a" if append else "w" - fieldnames = list(rows[0].keys()) - with open(path, mode) as f_out: + with open(path, mode, encoding='utf8') as f_out: writer = csv.DictWriter(f_out, fieldnames) if is_new_file or not append: @@ -195,7 +195,7 @@ def write_csv(path: Path, rows: list[dict_of_str], append=False) -> None: print() -def process_variables(args: list_of_str) -> dict_of_str: +def process_variables(args: ListOfStr) -> DictOfStr: """ Process command-line arguments containing a filename and key-value pairs. """ @@ -222,7 +222,7 @@ def process_variables(args: list_of_str) -> dict_of_str: return {} -def process_args(args: list_of_str): +def process_args(args: ListOfStr): """ Process command-line arguments containing a filename and key-value pairs. diff --git a/ghgql/lib/git.py b/ghgql/lib/git.py index 485538c..435f5df 100644 --- a/ghgql/lib/git.py +++ b/ghgql/lib/git.py @@ -9,7 +9,7 @@ # TODO This could be better as a class - then the structure can be reused # and used for type checking as a whole or getting fields on the object. # Use init to process `value`. -def parse_commit(value: dict): +def parse_commit(value: dict, verbose=False): """ Extract relevant fields from nested data and return as a flat dict. 
""" @@ -21,23 +21,31 @@ def parse_commit(value: dict): committer_login = committer["login"] if committer is not None else None commit_date = time.as_date(value["committedDate"]) - return dict( - commit_id=value["abbreviatedOid"], - author_date=author_date, - author_login=author_login, - committed_date=commit_date, - committer_login=committer_login, - changed_files=value["changedFiles"], - additions=value["additions"], - deletions=value["deletions"], - message=value["message"], - ) - - -def prepare_row(commit: dict, repo_name: str, branch_name: str): + if verbose: + return dict( + commit_id=value["abbreviatedOid"], + author_date=author_date, + author_login=author_login, + committed_date=commit_date, + committer_login=committer_login, + changed_files=value["changedFiles"], + additions=value["additions"], + deletions=value["deletions"], + message=value["message"], + ) + else: + return dict( + commit_id=value["abbreviatedOid"], + committed_date=commit_date, + committer_login=committer_login, + message=value["message"], + ) + + +def prepare_row(commit: dict, repo_name: str, branch_name: str, verbose=False): """ Convert commit metadata to a dict for writing to a CSV. """ - parsed_commit_data = parse_commit(commit) + parsed_commit_data = parse_commit(commit, verbose) return dict(repo_name=repo_name, branch_name=branch_name, **parsed_commit_data) diff --git a/ghgql/queries/repos/repo_commits_branch.gql b/ghgql/queries/repos/repo_commits_branch.gql new file mode 100644 index 0000000..1bc187f --- /dev/null +++ b/ghgql/queries/repos/repo_commits_branch.gql @@ -0,0 +1,39 @@ +# Get details of all commits for a single repo and optional date range, +# using paging to get all commits. +query CommitsForRepo($owner: String!, $repo_name: String!, $branch_name: String!, $since: GitTimestamp, $cursor: String) { + repository(owner: $owner, name: $repo_name) { + ref(qualifiedName: $branch_name) { + name + target { + ... 
on Commit { + history(since: $since, first: 100, after: $cursor) { + totalCount + pageInfo { + hasNextPage + endCursor + } + nodes { + abbreviatedOid + authoredDate + author { + user { + login + } + } + committedDate + committer { + user { + login + } + } + changedFiles + additions + deletions + message + } + } + } + } + } + } +} diff --git a/ghgql/repo_commits.py b/ghgql/repo_commits.py index 9b033a9..8e0cce3 100755 --- a/ghgql/repo_commits.py +++ b/ghgql/repo_commits.py @@ -3,7 +3,7 @@ Repo commits report application. Fetch all commits for a single given repo using paging. Accepts an optional -start date. +committer, output dir, start date, end date, and verbosity. """ import argparse import datetime @@ -14,53 +14,65 @@ import lib.git QUERY_PATH = Path("queries/repos/repo_commits.gql") -CSV_OUT_NAME = "repo-commits--{repo_name}--end-{end_date}--start-{start_date}.csv" +QUERY_PATH_BRANCH = Path("queries/repos/repo_commits_branch.gql") +CSV_OUT_NAME = "{owner}--{repo}--start-{start}--end-{end}.csv" +CSV_OUT_NAME_BRANCH = "{owner}--{repo}--{branch}--start-{start}--end-{end}.csv" -def parse(resp: dict): + +def parse(resp: dict, branch_name): """ Parse response data for the repo commits query. """ - branch = resp["repository"]["defaultBranchRef"] - + key = "ref" if branch_name else "defaultBranchRef" + branch = resp["repository"][key] branch_name = branch.get("name") - commit_history = branch["target"]["history"] total_commits = commit_history["totalCount"] commits = commit_history["nodes"] page_info = commit_history["pageInfo"] - cursor = page_info["endCursor"] if page_info["hasNextPage"] else None return branch_name, total_commits, commits, cursor -def process_response(resp: dict, repo_name: str): +def process_response(resp: dict, repo_name: str, branch_name: str, verbose=False): """ Format the response from a request for repo commits. 
""" - branch_name, total_commits, commits, cursor = parse(resp) - + branch_name, total_commits, commits, cursor = parse(resp, branch_name) processed_commits = [ - lib.git.prepare_row(c, repo_name, branch_name) for c in commits + lib.git.prepare_row(c, repo_name, branch_name, verbose) for c in commits ] return processed_commits, total_commits, cursor -def get_commits(owner: str, repo_name: str, start_date=None) -> list[dict]: +def get_commits( + owner: str, + repo_name: str, + branch_name=None, + committer=None, + start_date=None, + end_date=None, + verbose=False) -> list[dict]: """ Fetch all commits for a given repo and an optional start date. Uses paging if there is more than 1 page of 100 commits to fetch. Returns a list of zero or more dict objects with commit data. """ - print("/".join((owner, repo_name))) + if branch_name: + print("/".join((owner, repo_name, branch_name))) + else: + print("/".join((owner, repo_name))) since = lib.time.as_git_timestamp(start_date) if start_date else None + before = lib.time.as_git_timestamp(end_date) if end_date else None query_variables = dict( owner=owner, repo_name=repo_name, + branch_name=branch_name, since=since, ) @@ -70,37 +82,71 @@ def get_commits(owner: str, repo_name: str, start_date=None) -> list[dict]: while True: counter += 1 - resp = lib.query_by_filename(QUERY_PATH, query_variables) - commits, total_commits, cursor = process_response(resp, repo_name) + query_path = QUERY_PATH_BRANCH if branch_name else QUERY_PATH + resp = lib.query_by_filename(query_path, query_variables) + commits, _, cursor = process_response(resp, repo_name, branch_name, verbose) results.extend(commits) - if counter == 1: - print(f" - commits: {total_commits}") - print(f" - pages: {math.ceil(total_commits / 100)}") - print(f"Processed page: #{counter}") - if cursor: query_variables["cursor"] = cursor else: break + if committer: + results = list(filter( + lambda res: res["committer_login"] == committer, + results + )) + + if before: + before = 
lib.time.as_date(before) + results = list(filter( + lambda res: res["committed_date"] < before, + results + )) + + if counter == 1: + total_commits = len(results) + print(f" - commits: {total_commits}") + print(f" - pages: {math.ceil(total_commits / 100)}") + print(f"Processed page: #{counter}") + return results -def commits_to_csv(owner, repo_name, start_date=None): +def commits_to_csv( + owner, + repo_name, + branch_name=None, + committer=None, + output_dir=None, + start_date="START", + end_date=datetime.date.today(), + verbose=False): """ Write a CSV of all commits in a repo. Existing file will be overwritten. """ - filename = CSV_OUT_NAME.format( - repo_name=repo_name, - end_date=datetime.date.today(), - start_date=start_date if start_date else "INIT", + filename = CSV_OUT_NAME_BRANCH if branch_name else CSV_OUT_NAME + filename = filename.format( + owner=owner, + repo=repo_name, + branch=branch_name, + start=start_date, + end=end_date, + ) + output_dir = Path(output_dir) if output_dir else lib.VAR_DIR + path = output_dir / filename + repo_commits = get_commits( + owner, + repo_name, + branch_name, + committer, + start_date, + end_date, + verbose, ) - path = lib.VAR_DIR / filename - - repo_commits = get_commits(owner, repo_name, start_date) lib.write_csv(path, repo_commits, append=False) @@ -114,10 +160,29 @@ def main(): parser.add_argument( "owner", metavar="OWNER", + help="Owner of the repository." + ) + parser.add_argument( + "repo", + metavar="REPO", + help="Repository name." + ) + parser.add_argument( + "committer", + metavar="COMMITTER", + help="Username of the commit author." 
+ ) + parser.add_argument( + "-b", + "--branch", + metavar="BRANCH", + help="Specific branch to pull commits from.", ) parser.add_argument( - "repo_name", - metavar="REPO_NAME", + "-o", + "--output-dir", + metavar="DIR", + help="Directory in which to write the csv file.", ) parser.add_argument( "-s", @@ -126,12 +191,29 @@ def main(): help="Optionally filter to commits from this date onwards." " Format: 'YYYY-MM-DD'.", ) + parser.add_argument( + "-e", + "--end", + metavar="DATE", + help="Optionally filter to commits strictly before this date." + " Format: 'YYYY-MM-DD'.", + ) + parser.add_argument( + "-v", + "--verbose", + help="Verbose commit output.", + ) args = parser.parse_args() commits_to_csv( args.owner, - args.repo_name, + args.repo, + args.branch, + args.committer, + args.output_dir, args.start, + args.end, + args.verbose, ) diff --git a/ghgql/repos_and_commits.py b/ghgql/repos_and_commits.py index a5ce5d3..2ee672a 100755 --- a/ghgql/repos_and_commits.py +++ b/ghgql/repos_and_commits.py @@ -46,7 +46,7 @@ def render(template, owner, repos, since, dry_run=False): """ Prepare and return template for repo commits query. """ - return template.render(owner=owner, repos=repos, since=since, dry_run=dry_run) + return template.render(owner, repos, since, dry_run) def process_results(results): @@ -63,10 +63,9 @@ def process_results(results): for repo_data in results.values(): name = repo_data["name"] branch = repo_data["defaultBranchRef"] - branch_name = branch.get("name") - raw_commits = branch["target"]["history"]["nodes"] + if raw_commits: for c in raw_commits: parsed_commit_data = lib.git.parse_commit(c) @@ -99,9 +98,12 @@ def get_results(template, owner, repos, since, dry_run): def write(path, rows): + """ + Write `rows` to the file `path`. 
+ """ wrote_header = False - with open(path, "w") as f_out: + with open(path, "w", encoding='utf8') as f_out: fieldnames = ( "repo_name", "branch_name", diff --git a/ghgql/repos_recent_commits.py b/ghgql/repos_recent_commits.py index 108fb31..f10f906 100755 --- a/ghgql/repos_recent_commits.py +++ b/ghgql/repos_recent_commits.py @@ -9,7 +9,7 @@ # TODO Since param # TODO: Refactor - move this function to lib. -def parse_commit(value): +def parse_commit(value, verbose=False): """ Extract fields from nested data as returned from API and return as flat dict. """ @@ -21,15 +21,23 @@ def parse_commit(value): committer_login = committer["login"] if committer is not None else None commit_date = lib.time.as_date(value["committedDate"]) - return dict( + if verbose: + return dict( + commit_id=value["abbreviatedOid"], + author_date=author_date, + author_login=author_login, + committed_date=commit_date, + committer_login=committer_login, + changed_files=value["changedFiles"], + additions=value["additions"], + deletions=value["deletions"], + message=value["message"], + ) + else: + return dict( commit_id=value["abbreviatedOid"], - author_date=author_date, - author_login=author_login, committed_date=commit_date, committer_login=committer_login, - changed_files=value["changedFiles"], - additions=value["additions"], - deletions=value["deletions"], message=value["message"], )