diff --git a/.gitignore b/.gitignore index c67cffa1c4375c4da852ca8b71f9b2690ad2580b..3b9086c7187dc78f9ce2dc1b137bf55ce5e963cc 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ *.ipr *.iml *.iws +*.pyc .idea/ .idea_modules/ sbt/*.jar @@ -49,6 +50,8 @@ dependency-reduced-pom.xml checkpoint derby.log dist/ +dev/create-release/*txt +dev/create-release/*new spark-*-bin-*.tgz unit-tests.log /lib/ diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py index 99c29ef9ff8b6ba116b06fe2f9ff37cda37cac79..a3b78a3eac6d098afe9714ca6eb73a58f619bef4 100755 --- a/dev/create-release/generate-contributors.py +++ b/dev/create-release/generate-contributors.py @@ -26,8 +26,6 @@ from releaseutils import * # You must set the following before use! JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") -JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None) -JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None) START_COMMIT = os.environ.get("START_COMMIT", "37b100") END_COMMIT = os.environ.get("END_COMMIT", "3693ae") @@ -40,8 +38,6 @@ if not START_COMMIT or not END_COMMIT: END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ") # Verify provided arguments -if not JIRA_USERNAME: sys.exit("JIRA_USERNAME must be provided") -if not JIRA_PASSWORD: sys.exit("JIRA_PASSWORD must be provided") start_commit_line = get_one_line(START_COMMIT) end_commit_line = get_one_line(END_COMMIT) num_commits = num_commits_in_range(START_COMMIT, END_COMMIT) @@ -60,14 +56,6 @@ if response.lower() != "y" and response: sys.exit("Ok, exiting") print "==================================================================================\n" -# Setup JIRA and github clients. 
We use two JIRA clients, one with authentication -# and one without, because authentication is slow and required only when we query -# JIRA user details but not Spark issues -jira_options = { "server": JIRA_API_BASE } -jira_client = JIRA(options = jira_options) -jira_client_auth = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) -github_client = Github() - # Find all commits within this range print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT) commits = get_one_line_commits(START_COMMIT, END_COMMIT) @@ -105,13 +93,17 @@ if releases or reverts or nojiras: if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts) if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras) print "==================== Warning: the above commits will be ignored ==================\n" -response = raw_input("%d commits left to process. Ok to proceed? [y/N] " % len(filtered_commits)) -if response.lower() != "y": +response = raw_input("%d commits left to process. Ok to proceed? [Y/n] " % len(filtered_commits)) +if response.lower() != "y" and response: sys.exit("Ok, exiting.") # Keep track of warnings to tell the user at the end warnings = [] +# Mapping from the invalid author name to its associated JIRA issues +# E.g. 
andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471") +invalid_authors = {} + # Populate a map that groups issues and components by author # It takes the form: Author name -> { Contribution type -> Spark components } # For instance, @@ -127,16 +119,23 @@ warnings = [] # } # author_info = {} +jira_options = { "server": JIRA_API_BASE } +jira_client = JIRA(options = jira_options) print "\n=========================== Compiling contributor list ===========================" for commit in filtered_commits: commit_hash = re.findall("^[a-z0-9]+", commit)[0] issues = re.findall("SPARK-[0-9]+", commit.upper()) - # Translate the author in case the github username is not an actual name - # Also guard against any special characters used in the name - # Note the JIRA client we use here must have authentication enabled author = get_author(commit_hash) - author = unidecode.unidecode(unicode(author, "UTF-8")) - author = translate_author(author, github_client, jira_client_auth, warnings) + author = unidecode.unidecode(unicode(author, "UTF-8")).strip() # guard against special characters + # If the author name is invalid, keep track of it along + # with all associated issues so we can translate it later + if is_valid_author(author): + author = capitalize_author(author) + else: + if author not in invalid_authors: + invalid_authors[author] = set() + for issue in issues: + invalid_authors[author].add(issue) date = get_date(commit_hash) # Parse components from the commit message, if any commit_components = find_components(commit, commit_hash) @@ -147,7 +146,7 @@ for commit in filtered_commits: author_info[author] = {} if issue_type not in author_info[author]: author_info[author][issue_type] = set() - for component in all_components: + for component in components: author_info[author][issue_type].add(component) # Find issues and components associated with this commit for issue in issues: @@ -168,7 +167,6 @@ print 
"========================================================================= # Each line takes the format "Author name - semi-colon delimited contributions" # e.g. Andrew Or - Bug fixes in Windows, Core, and Web UI; improvements in Core # e.g. Tathagata Das - Bug fixes and new features in Streaming -contributors_file_name = "contributors.txt" contributors_file = open(contributors_file_name, "w") authors = author_info.keys() authors.sort() @@ -192,11 +190,23 @@ for author in authors: # Do not use python's capitalize() on the whole string to preserve case assert contribution contribution = contribution[0].capitalize() + contribution[1:] + # If the author name is invalid, use an intermediate format that + # can be translated through translate-contributors.py later + # E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672 + if author in invalid_authors and invalid_authors[author]: + author = author + "/" + "/".join(invalid_authors[author]) line = "%s - %s" % (author, contribution) contributors_file.write(line + "\n") contributors_file.close() print "Contributors list is successfully written to %s!" 
% contributors_file_name +# Prompt the user to translate author names if necessary +if invalid_authors: + warnings.append("Found the following invalid authors:") + for a in invalid_authors: + warnings.append("\t%s" % a) + warnings.append("Please run './translate-contributors.py' to translate them.") + # Log any warnings encountered in the process if warnings: print "\n============ Warnings encountered while creating the contributor list ============" diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index 0d6830b11dc7302e78a958fcb3762daeb9e86a37..76a10c32886d4f29a1fce9a813c200f447a9bb5f 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -44,6 +44,9 @@ except ImportError: print "Install using 'sudo pip install unidecode'" sys.exit(-1) +# Contributors list file name +contributors_file_name = "contributors.txt" + # Utility functions run git commands (written with Git 1.8.5) def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] def get_author(commit_hash): @@ -69,7 +72,8 @@ known_issue_types = { "build": "build fixes", "improvement": "improvements", "new feature": "new features", - "documentation": "documentation" + "documentation": "documentation", + "test": "test" } # Maintain a mapping for translating component names when creating the release notes @@ -182,36 +186,3 @@ def capitalize_author(author): words = [w[0].capitalize() + w[1:] for w in words if w] return " ".join(words) -# Maintain a mapping of translated author names as a cache -translated_authors = {} - -# Format the given author in a format appropriate for the contributors list. -# If the author is not an actual name, search github and JIRA for potential -# replacements and log all candidates as a warning. 
-def translate_author(github_author, github_client, jira_client, warnings): - if is_valid_author(github_author): - return capitalize_author(github_author) - # If the translated author is already cached, just return it - if github_author in translated_authors: - return translated_authors[github_author] - # Otherwise, author name is not found, so we need to search for an alternative name - candidates = set() - github_name = get_github_name(github_author, github_client) - jira_name = get_jira_name(github_author, jira_client) - if is_valid_author(github_name): github_name = capitalize_author(github_name) - if is_valid_author(jira_name): jira_name = capitalize_author(jira_name) - if github_name: candidates.add(github_name) - if jira_name: candidates.add(jira_name) - # Only use the github name as a replacement automatically - # The JIRA name may not make sense because it can belong to someone else - if is_valid_author(github_name): - candidates_message = " (another candidate is %s)" % jira_name if jira_name else "" - warnings.append("Replacing github user %s with %s%s" % (github_author, github_name, candidates_message)) - translated_authors[github_name] = github_name - return translated_authors[github_name] - # No direct replacement, so return the original author and list any candidates found - candidates_message = " (candidates: %s)" % nice_join(candidates) if candidates else "" - warnings.append("Unable to find a replacement for github user %s%s" % (github_author, candidates_message)) - translated_authors[github_author] = github_author - return translated_authors[github_author] - diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py new file mode 100755 index 0000000000000000000000000000000000000000..ef4625b003cb6b1081e460b95fe803c1bb70ddb7 --- /dev/null +++ b/dev/create-release/translate-contributors.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# 
contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script translates invalid authors in the contributors list generated +# by generate-contributors.py. When the script encounters an author name that +# is considered invalid, it searches Github and JIRA in an attempt to search +# for replacements. This tool runs in two modes: +# +# (1) Interactive mode: For each invalid author name, this script presents +# all candidate replacements to the user and awaits user response. In this +# mode, the user may also input a custom name. This is the default. +# +# (2) Non-interactive mode: For each invalid author name, this script replaces +# the name with the first valid candidate it can find. If there is none, it +# uses the original name. This can be enabled through the --non-interactive flag. + +import os +import sys + +from releaseutils import * + +# You must set the following before use! 
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
if not JIRA_USERNAME or not JIRA_PASSWORD:
    sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set")

# NOTE: this script is Python 2 code (it uses raw_input and unicode). print is always
# called with one parenthesized argument, which behaves identically under Python 2.

# The translated list is written to <old_file_name>.new; the input file is only read
if not os.path.isfile(contributors_file_name):
    print("Contributors file %s does not exist!" % contributors_file_name)
    print("Have you run ./generate-contributors.py yet?")
    sys.exit(1)
new_contributors_file_name = contributors_file_name + ".new"
warnings = []

# In non-interactive mode, this script will choose the first replacement that is valid
INTERACTIVE_MODE = "--non-interactive" not in sys.argv[1:]
if INTERACTIVE_MODE:
    print("Running in interactive mode. To disable this, provide the --non-interactive flag.")

# Setup Github and JIRA clients
jira_options = {"server": JIRA_API_BASE}
jira_client = JIRA(options=jira_options, basic_auth=(JIRA_USERNAME, JIRA_PASSWORD))
github_client = Github()

# Marker used in place of a candidate name when a lookup produced nothing
NOT_FOUND = "Not found"

# Convert a candidate name to plain ASCII with surrounding whitespace removed.
# The name may be a byte string or may already be unicode (JIRA returns unicode).
def _to_ascii(name):
    try:
        name = unicode(name, "UTF-8")
    except TypeError:
        # already in unicode
        pass
    return unidecode.unidecode(name).strip()

# Generate candidates for the given author. This should only be called if the given author
# name does not represent a full name as this operation is somewhat expensive. Under the
# hood, it makes several calls to the Github and JIRA API servers to find the candidates.
#
# Arguments:
#   author - the user name to look up on Github and JIRA
#   issues - JIRA issue keys (e.g. "SPARK-1444") whose assignees are also considered
#
# This returns a list of (candidate name, source) 2-tuples. E.g.
# [
#   (NOT_FOUND, "No full name found for Github user andrewor14"),
#   ("Andrew Or", "Full name of JIRA user andrewor14"),
#   ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"),
#   ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"),
#   (NOT_FOUND, "No assignee found for SPARK-1763")
# ]
def generate_candidates(author, issues):
    candidates = []
    # First check for full name of Github user
    github_name = get_github_name(author, github_client)
    if github_name:
        candidates.append((github_name, "Full name of Github user %s" % author))
    else:
        candidates.append((NOT_FOUND, "No full name found for Github user %s" % author))
    # Then do the same for JIRA user
    jira_name = get_jira_name(author, jira_client)
    if jira_name:
        candidates.append((jira_name, "Full name of JIRA user %s" % author))
    else:
        candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % author))
    # Then do the same for the assignee of each of the associated JIRAs
    # Note that a given issue may not have an assignee, or the assignee may not have a full name
    for issue in issues:
        jira_assignee = jira_client.issue(issue).fields.assignee
        if not jira_assignee:
            candidates.append((NOT_FOUND, "No assignee found for %s" % issue))
            continue
        user_name = jira_assignee.name
        display_name = jira_assignee.displayName
        if display_name:
            candidates.append((display_name, "Full name of %s assignee %s" % (issue, user_name)))
        else:
            candidates.append((NOT_FOUND, "No full name found for %s assignee %s" % (issue, user_name)))
    # Guard against special characters in candidate names
    return [(_to_ascii(candidate), source) for candidate, source in candidates]

# Print the candidates for one invalid author and return the chosen replacement name.
# Candidates are listed along with their sources, e.g.
#   [X] No full name found for Github user andrewor14
#   [0] Andrew Or - Full name of JIRA user andrewor14
#   [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14
#   [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14
#   [X] No assignee found for SPARK-1763
#   [3] Custom
# In interactive mode the user picks an entry or types a custom name. In non-interactive
# mode the first valid candidate is returned, or `fallback` if there is none.
def _select_replacement(fallback, candidates):
    candidate_names = []
    for candidate, source in candidates:
        if candidate == NOT_FOUND:
            print(" [X] %s" % source)
        else:
            print(" [%d] %s - %s" % (len(candidate_names), candidate, source))
            candidate_names.append(candidate)
    if INTERACTIVE_MODE:
        # The "custom" option always takes the index right after the last real candidate
        custom_index = len(candidate_names)
        print(" [%d] Custom" % custom_index)
        response = raw_input(" Your choice: ")
        while not response.isdigit() or int(response) > custom_index:
            response = raw_input(" Please enter an integer between 0 and %d: " % custom_index)
        choice = int(response)
        if choice == custom_index:
            return raw_input(" Please type a custom name for this author: ")
        return candidate_names[choice]
    for name in candidate_names:
        if is_valid_author(name):
            return name
    return fallback

# Translate each invalid author by searching for possible candidates from Github and JIRA
# In interactive mode, this script presents the user with a list of choices and has the user
# select from this list. Additionally, the user may also choose to enter a custom name.
# In non-interactive mode, this script picks the first valid author name from the candidates
# If no such name exists, the original name is used (without the JIRA numbers).
contributors_file = open(contributors_file_name, "r")
new_contributors_file = open(new_contributors_file_name, "w")
print("\n========================== Translating contributor list ==========================")
try:
    for line in contributors_file:
        author = line.split(" - ")[0]
        print("Processing author %s" % author)
        if not author:
            print(" ERROR: Expected the following format <author> - <contributions>")
            print(" ERROR: Actual = %s" % line)
            # Nothing to translate: copy the malformed line through unchanged. Replacing
            # an empty author would insert the new name between every character.
            new_contributors_file.write(line)
            continue
        if not is_valid_author(author):
            # Intermediate format written by generate-contributors.py: <name>/<issue>/<issue>...
            author_parts = author.split("/")
            user_name = author_parts[0]
            candidates = generate_candidates(user_name, author_parts[1:])
            new_author = _select_replacement(user_name, candidates)
            # Finally, capitalize the author and replace the original one with it
            # If the final replacement is still invalid, log a warning
            if is_valid_author(new_author):
                new_author = capitalize_author(new_author)
            else:
                warnings.append("Unable to find a valid name %s for author %s" % (new_author, author))
            print(" * Replacing %s with %s" % (author, new_author))
            # The author is the prefix of the line; leave the contribution text untouched
            line = line.replace(author, new_author, 1)
        new_contributors_file.write(line)
finally:
    # Close both files even if a Github/JIRA lookup or the prompt fails midway
    contributors_file.close()
    new_contributors_file.close()
print("==================================================================================\n")

print("Translated contributors list successfully written to %s!" % new_contributors_file_name)

# Log any warnings encountered in the process
if warnings:
    print("\n========== Warnings encountered while translating the contributor list ===========")
    for w in warnings:
        print(w)
    print("Please manually correct these in the final contributors list at %s." % new_contributors_file_name)
    print("==================================================================================\n")