Python реализация на github - PullRequest
0 голосов
/ 08 мая 2020

Как я могу исправить следующую ошибку, когда комментарии из слитых коммитов не загружаются? Я думаю, что есть проблема с REST на GraphQL, но я не знаю, как ее исправить. Следующий скрипт используется для: получения всех количественно определенных действий из набора репозиториев GitHub. Список репозиториев для поиска находится в файле repos.lst. Аутентификация на GitHub также хранится в отдельном файле с именем github.token, содержащем имя пользователя и пароль / токен доступа в двух отдельных строках. Количественно оцениваемые действия включают объединенные запросы на вытягивание, закрытые проблемы (кроме тех, которые явно удалены из списка за неконструктивность) и комментарии. Pull-реквесты: перечислены только закрытые pull-запросы и определен их статус слияния, наконец, объединенные пул-реквесты сохраняются в файле JSON с записями, содержащими ссылочный номер, репозиторий, заголовок, автора и URL запроса на вытягивание. Проблемы: перечислены только закрытые проблемы, запросы на вытягивание (которые также рассматриваются как проблемы API GitHub) удалены из них, вопросы, внесенные в черный список в файле blacklist.lst, также удаляются из списка, и, наконец, оставшиеся проблемы сохраняются в файле JSON с записями, содержащими ссылочный номер, репозиторий, заголовок, автора и URL-адрес выпуска. Комментарии: Комментарии из коммитов, из проблем и из запросов на вытягивание перечислены и сохранены в файле JSON с записями, содержащими автора, идентификатор комментария, репозиторий, содержимое комментария и URL-адрес комментария. Комментарии к задачам и комментарии к запросу на вытягивание также будут содержать справочный номер задачи, соответственно, запроса на вытягивание, при этом проблемы дополнительно имеют также первоначального автора проблемы, в то время как комментарии коммита будут содержать SHA1 фиксации.

#!/usr/bin/env python3

# requests library is used to access GitHub API
import requests
import json
import re
import sys
import os.path
import bisect
import getpass
from itertools import chain
import repos

# determine the current course year and the directory holding result data
year = repos.get_year()
path = '../../{}/rezultate/'.format(year)

# read the list of repositories to scan; the syllabus repo is always included
repo_name = re.compile(r'([\w-]+)/([\w-]+)')
repos = [('etc-so', 'syllabus')]
with open(path + 'data/repos.lst', 'r') as f:
    for s in f:
        s = s.strip()
        # ignore empty lines and comment lines
        if s and not s.startswith('#'):
            m = repo_name.fullmatch(s)
            if not m:
                # skip malformed entries instead of crashing on m.group(...)
                # below (m is None when fullmatch fails)
                print("Invalid repo name: " + s)
                continue
            repos.append((m.group(1), m.group(2)))

# read the list of students whose activity is being quantified;
# students.json is expected to be a list of objects with a 'user' key
students = []
with open(path + 'data/students.json', 'r') as f:
    students = [x['user'] for x in json.load(f)]

if not students:
    print("No students to check for")
    sys.exit(1)

# get the access token: prefer the github.token file (username on the first
# line, password/access token on the second), otherwise prompt interactively
if os.path.exists('github.token'):
    with open('github.token', 'r') as f:
        auth = (f.readline().strip(), f.readline().strip())
else:
    auth = (input("Enter GitHub username: "), getpass.getpass('Password: '))

# labels per (repo, ref), filled while listing issues, so they can be
# attached to the corresponding comments as well
issue_labels = {}

# get the persistent PR, issue and comment lists saved by previous runs,
# keyed by GitHub id so the 'week' and manually assigned 'value' survive
if os.path.exists(path + 'data/pulls.json'):
    with open(path + 'data/pulls.json', 'r') as f:
        # pull tuples are (week, value, author) plus an optional 'multi' entry
        saved_pulls = dict((x['id'],(x['week'], x['value'], x['author']) + ((x['multi'],) if 'multi' in x else ())) for x in json.load(f))
else:
    saved_pulls = {}

# issue tuples are (week, value)
if os.path.exists(path + 'data/issues.json'):
    with open(path + 'data/issues.json', 'r') as f:
        saved_issues = dict((x['id'],(x['week'], x['value'])) for x in json.load(f))
else:
    saved_issues = {}

# comment tuples are (week, value); value None means not yet validated
if os.path.exists(path + 'data/comments.json'):
    with open(path + 'data/comments.json', 'r') as f:
        saved_comments = dict((x['id'],(x['week'], x['value'])) for x in json.load(f))
else:
    saved_comments = {}

# determine the current course week (0 = unknown yet)
current_week = 0
# if there were already some activity then move ahead to the last week
if saved_pulls or saved_issues or saved_comments:
    current_week = max(x[0] for x in chain(saved_pulls.values(),
                                           saved_issues.values(),
                                           saved_comments.values()))
# if there is a command line argument use it as current week
if len(sys.argv) == 2:
    if sys.argv[1].isdigit():
        current_week = int(sys.argv[1])
    # -n increments the current week if it can be obtained from the activity
    elif sys.argv[1] == '-n' and current_week != 0:
        current_week += 1
        print("Switching to week {}".format(current_week))
    # '?' only reports the detected week and exits without fetching anything
    elif sys.argv[1] == '?':
        print("Current week is {}".format(current_week))
        sys.exit(0)
    else:
        print("Invalid command line parameter")
        sys.exit(1)
elif len(sys.argv) > 2:
    print("Too many parameters")
    sys.exit(1)
# if no current week was obtained, start with week 1
if not current_week:
    current_week = 1

# GitHub REST API v3 base URL and per-repo endpoint template
api_url = "https://api.github.com/"
get_url = "repos/{}/{}/{}"

# NOTE(review): c_params appears unused -- every github_get call below passes
# its query parameters explicitly; kept for reference, confirm before removal
c_params = {
    'state': 'closed', # get closed pull-requests/issues
    'sort': 'created',
    'direction': 'asc', # sort it in ascending order by their creation time
}

# accumulators filled by the per-repo loop below
pr = []        # merged pull-request records
pr_times = []  # merge timestamps, kept sorted, parallel to pr
gi = []        # issue records
comments = []  # comment records (commit + issue + review comments)
c_times = []   # comment creation timestamps, kept sorted, parallel to comments
authors = {}   # (repo, ref) -> issue/PR author login

# extracts the kind ('pull'/'issues') and number from a comment html URL
issue_comment_re = re.compile(r'.*/(pull|issues)/([0-9]+)#.*')
# parses the RFC 5988 Link header used by GitHub for pagination
link_rel = re.compile(r'<([^>]+)>; rel="([\w]+)"(?:, )?')

# last seen X-RateLimit-Remaining header value, reported at the end
request_count = 0

def github_get(get_code, **params):
    """Fetch every page of a GitHub REST collection for the current repo.

    Builds ``repos/{org}/{repo}/{get_code}`` from the module-level ``org``
    and ``repo`` (set by the main loop) and follows ``Link: rel="next"``
    headers until the collection is exhausted.

    params -- query parameters for the initial request only; follow-up
              page URLs already embed the full query string.
    Returns the concatenated list of JSON items from all pages.
    Exits the whole script on any non-200 response.
    """
    global request_count
    res = []
    url = api_url + get_url.format(org, repo, get_code)
    while url:
        r = requests.get(url, auth=auth, params=params)
        if r.status_code != 200:
            print('Status code {} received'.format(r.status_code))
            print(r.content)
            sys.exit(1)
        # the 'next' link already carries the query string, so do not
        # re-send (and thereby duplicate) the original params on later pages
        params = {}
        links = dict((m[1],m[0]) for m in link_rel.findall(r.headers.get('link', '')))
        url = links.get('next')
        res.extend(r.json())
        # track the remaining rate limit for the final report
        request_count = r.headers['X-RateLimit-Remaining']
    return res

# point values associated with well-known label names; negative values mark
# a contribution as rejected/penalized
label_values = {
    'bug': 2,
    'enhancement': 2,
    'challenge': 2,
    'help wanted': 1,
    'invalid': -1,
    'duplicate': -2
}

def label_value(labels):
    """Compute the point value of an issue/PR from its label names.

    Scores come from the predefined table above plus the number of black
    stars in each label name.  Any negative score vetoes the whole
    contribution and is returned directly; otherwise the best score wins
    and every white star adds one bonus point.
    """
    scored = [label_values[name] for name in labels if name in label_values]
    scored.extend(name.count('★') for name in labels)
    if not scored:
        base = 0
    elif min(scored) < 0:
        # a negative-valued label overrides everything, including bonuses
        return min(scored)
    else:
        base = max(scored)
    white_stars = sum(name.count('☆') for name in labels)
    return base + white_stars


def issue_value(value, issue_id=None):
    """Return *value* unchanged, warning when it differs from the saved one.

    When *issue_id* is given (and truthy), the value recorded for that
    issue on a previous run is looked up in ``saved_issues`` and a change
    notice is printed if it no longer matches.
    """
    if issue_id:
        saved = saved_issues.get(issue_id)
        if saved and saved[1] != value:
            print("Value change detected in issue {}".format(issue_id))
    return value


def pull_value(label, pull_id=None, ref=0):
    """Compute the point value of a merged pull-request from its labels.

    An unlabeled PR yields None (and a warning naming *ref*).  When
    *pull_id* is given, the value saved on a previous run is compared
    against the fresh one: a mismatch is reported, and a saved value is
    reused when the fresh one is None.
    """
    if not label:
        print("No label for pull-request {}".format(ref))
        value = None
    else:
        value = label_value(label)
    if pull_id:
        previous = saved_pulls.get(pull_id)
        if previous and previous[1] is not None and previous[1] != value:
            print("Value change detected in pull-request {}".format(ref))
            # keep the manually recorded value if the labels disappeared
            if value is None:
                value = previous[1]
    return value


# per-run totals across all repositories
ptotal = 0
itotal = 0
ctotal = 0

# pass through the repos
for org,repo in repos:
    print('Processing repo {}:'.format(repo), end=' ', flush=True)

    # get all the issues, do this first as it all includes the pull-requests
    # for which only here we can determine the labels
    issues = github_get('issues', state='all', sort='created', direction='asc')

    icount = 0;
    for i in issues:
        ref = int(i['number'])
        author = i['user']['login']
        # remember issue/PR authors and labels for the comment passes below
        authors[(repo, ref)] = author
        label = [x['name'] for x in i['labels']]
        issue_labels[(repo, ref)] = label
        # keep only real issues (the API lists PRs as issues too) opened
        # by tracked students
        if 'pull_request' not in i and author in students:
            gi.append(
                {
                    'id': i['id'],
                    'repo': repo,
                    'ref': ref,
                    'title': i['title'],
                    'url': i['html_url'],
                    'author': author,
                    'label': label,
                    # preserve the week recorded on a previous run, if any
                    'week' : saved_issues.get(i['id'], [current_week])[0],
                    'value' : issue_value(label_value(label), i['id'])
                })
            icount += 1


    # get the merged pull-requests
    pulls = github_get('pulls', state='closed')

    pcount = 0;
    for p in pulls:
        ref = int(p['number'])
        author = p['user']['login']
        label = issue_labels.get((repo, ref), [])
        # only merged PRs by students that are not negatively labeled count
        if p['merged_at'] and author in students and (not label or label_value(label) >= 0):
            # keep the PR list sorted by merge time
            i = bisect.bisect_right(pr_times, p['merged_at'])
            pr_times.insert(i, p['merged_at'])

            # check authors of included commits
            if p['id'] in saved_pulls:
                # the author set was already resolved on a previous run
                author = saved_pulls[p['id']][2]
            else:
                pcommits = github_get('pulls/{}/commits'.format(ref))
                pc_authors = [author]

                for a in pcommits:
                    # commit author/committer objects may be null (e.g. when
                    # the email does not map to a GitHub account)
                    al = a['author'].get('login') if a['author'] else None
                    cl = a['committer'].get('login') if a['committer'] else None
                    # 'web-flow' is GitHub's web-UI merge committer; 'kcs'
                    # is presumably the instructor account -- TODO confirm
                    if al == cl or cl == 'web-flow' or cl == 'kcs':
                        aa = al
                    else:
                        aa = ':'.join(x for x in [al, cl] if x)
                    if aa and aa not in pc_authors and aa != 'kcs':
                        pc_authors.append(aa)

                # more than one distinct contributor: store the whole list
                if len(pc_authors) != 1:
                    author = pc_authors

            pr.insert(i, {
            'id': p['id'],
                'repo': repo,
                'ref': ref,
                'title': p['title'],
                'url': p['html_url'],
                'label': label,
                'author': author,
                # preserve the week recorded on a previous run, if any
                'week': saved_pulls.get(p['id'], [current_week])[0],
                'value': pull_value(label, p['id'], ref)
                })
            # propagate the optional multi-author marker from the saved data
            if len(saved_pulls.get(p['id'], [])) > 3:
                pr[i]['multi'] = saved_pulls[p['id']][3]
            pcount += 1


    # and now for the comments:
    # this is more troublesome as constructive comments must be selected
    # manually, so we are keeping persistent JSON file for the comments
    # holding a valid tag, newly downloaded comments have this tag unset
    # and they can be validated afterwards manually (or by script)

    ccount = 0;

    # 3 types of comments exist on GitHub: issue comments, review comments
    # and commit comments, all have to be handled separately
    ccomments = github_get('comments', per_page=50, sort='created')

    for c in ccomments:
        author = c['user']['login']
        if author in students:
            # keep comments ordered by creation time across all three types
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'commit': c['commit_id'],
                    'msg': c['body'],
                    'url': c['html_url'],
                    'author': author,
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    # value is assigned manually later; None = not yet reviewed
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            ccount += 1

    icomments = github_get('issues/comments', per_page=50, sort='created')

    for c in icomments:
        author = c['user']['login']
        if author in students:
            url = c['html_url']
            # the issue/PR number is only recoverable from the comment URL
            m = issue_comment_re.fullmatch(url)
            if not m:
                print("Problem parsing issue url " + url)
                sys.exit(1)
            ref = int(m.group(2))
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'issue': ref,
                    'msg': c['body'],
                    'url': url,
                    'author': author,
                    # original author of the commented issue/PR
                    'issue_author': authors[(repo, ref)],
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            # comments on real issues additionally carry the issue labels
            if m.group(1) == 'issues' and (repo, ref) in issue_labels:
                comments[i]['label'] = issue_labels[(repo, ref)]
            ccount += 1

    pcomments = github_get('pulls/comments', per_page=50, sort='created')

    for c in pcomments:
        author = c['user']['login']
        if author in students:
            # review comments reference their PR via the API URL's last segment
            ref = int(c['pull_request_url'].rsplit('/', 1)[1])
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'pull': ref,
                    'msg': c['body'],
                    'url': c['html_url'],
                    'author': author,
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            ccount += 1

    print('found {} merged pull-requests, {} issues and {} comments'.format(pcount, icount, ccount))
    ptotal += pcount
    itotal += icount
    ctotal += ccount

# persist the collected records back into the persistent JSON files;
# ensure_ascii=False keeps non-ASCII titles/comments human-readable
with open(path + 'data/pulls.json', 'w') as f:
    json.dump(pr, f, indent=4, sort_keys=True, ensure_ascii=False)

with open(path + 'data/issues.json', 'w') as f:
    json.dump(gi, f, indent=4, sort_keys=True, ensure_ascii=False)

with open(path + 'data/comments.json', 'w') as f:
    json.dump(comments, f, indent=4, sort_keys=True, ensure_ascii=False)

# final summary, plus the remaining API rate limit observed on the last request
print("Total: {} merged pull-requests, {} issues and {} comments [{} contributions]".format(
    ptotal, itotal, ctotal, ptotal + itotal + ctotal))
print("Remaining request count: {}".format(request_count))

# map each repo name to its position in the repos list, for ordering
repo_key = {name: position for position, (owner, name) in enumerate(repos)}

def sort_repos(x):
    '''Sort key for (repo, ref) blacklist entries.

       Entries are ordered by the position of their repo in the repos
       list; entries whose repo is unknown sort after all known repos.
    '''
    if x[0] not in repo_key:
        # unknown repo: push past every known one, keep the raw entry order
        return (len(repos),) + x
    return (repo_key[x[0]], x[1])
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...