How can I fix the following error, where comments from merged commits are not loaded? I think there is a problem with REST versus GraphQL, but I don't know how to fix it.

The following script is used to fetch all quantified activities from a set of GitHub repositories. The list of repositories to search is in the file repos.lst. The GitHub authentication is also stored in a separate file named github.token, containing the username and the password/access token on two separate lines. Quantified activities include merged pull requests, closed issues (except those explicitly removed from the list for being non-constructive) and comments.

Pull requests: only closed pull requests are listed and their merge status is determined; finally, the merged pull requests are saved to a JSON file with entries containing the reference number, repository, title, author and URL of the pull request.

Issues: only closed issues are listed; pull requests (which the GitHub API also treats as issues) are removed from them; issues blacklisted in the file blacklist.lst are removed from the list as well; finally, the remaining issues are saved to a JSON file with entries containing the reference number, repository, title, author and URL of the issue.

Comments: comments from commits, issues and pull requests are listed and saved to a JSON file with entries containing the author, comment ID, repository, comment content and comment URL. Issue comments and pull request comments additionally contain the reference number of the issue or pull request respectively, with issue comments also carrying the original author of the issue, while commit comments contain the SHA1 of the commit.
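For reference, the two input files are plain text; these are illustrative contents (the names and the token value are made up):

repos.lst (one owner/repository pair per line; empty lines and lines starting with # are ignored):

    # course repositories
    etc-so/syllabus
    example-user/example-repo

github.token (username on the first line, password or access token on the second):

    example-user
    ghp_0000000000000000000000000000000000000000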
#!/usr/bin/env python3
# requests library is used to access GitHub API
import requests
import json
import re
import sys
import os.path
import bisect
import getpass
from itertools import chain
import repos
year = repos.get_year()
path = '../../{}/rezultate/'.format(year)
# read the list of repositories
repo_name = re.compile(r'([\w-]+)/([\w-]+)')
repos = [('etc-so', 'syllabus')]
with open(path + 'data/repos.lst', 'r') as f:
    for s in f:
        s = s.strip()
        # ignore empty lines and comments
        if s and not s.startswith('#'):
            m = repo_name.fullmatch(s)
            if not m:
                print("Invalid repo name: " + s)
                continue  # skip malformed entries instead of crashing on m.group()
            repos.append((m.group(1), m.group(2)))
# read the list of students
students = []
with open(path + 'data/students.json', 'r') as f:
    students = [x['user'] for x in json.load(f)]
if not students:
    print("No students to check for")
    sys.exit(1)
# get the access token
if os.path.exists('github.token'):
    with open('github.token', 'r') as f:
        auth = (f.readline().strip(), f.readline().strip())
else:
    auth = (input("Enter GitHub username: "), getpass.getpass('Password: '))
# construct a labels list, so it can be added to the comments as well
issue_labels = {}
# get the persistent PR, issue and comment lists
if os.path.exists(path + 'data/pulls.json'):
    with open(path + 'data/pulls.json', 'r') as f:
        saved_pulls = dict((x['id'], (x['week'], x['value'], x['author'])
                            + ((x['multi'],) if 'multi' in x else ()))
                           for x in json.load(f))
else:
    saved_pulls = {}
if os.path.exists(path + 'data/issues.json'):
    with open(path + 'data/issues.json', 'r') as f:
        saved_issues = dict((x['id'], (x['week'], x['value'])) for x in json.load(f))
else:
    saved_issues = {}
if os.path.exists(path + 'data/comments.json'):
    with open(path + 'data/comments.json', 'r') as f:
        saved_comments = dict((x['id'], (x['week'], x['value'])) for x in json.load(f))
else:
    saved_comments = {}
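# saved_pulls maps id -> (week, value, author[, multi]);
# saved_issues and saved_comments map id -> (week, value)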
current_week = 0
# if there was already some activity then move ahead to the last week
if saved_pulls or saved_issues or saved_comments:
    current_week = max(x[0] for x in chain(saved_pulls.values(),
                                           saved_issues.values(),
                                           saved_comments.values()))
# if there is a command line argument, use it as the current week
if len(sys.argv) == 2:
    if sys.argv[1].isdigit():
        current_week = int(sys.argv[1])
    # -n increments the current week if it can be obtained from the activity
    elif sys.argv[1] == '-n' and current_week != 0:
        current_week += 1
        print("Switching to week {}".format(current_week))
    elif sys.argv[1] == '?':
        print("Current week is {}".format(current_week))
        sys.exit(0)
    else:
        print("Invalid command line parameter")
        sys.exit(1)
elif len(sys.argv) > 2:
    print("Too many parameters")
    sys.exit(1)
# if no current week was obtained, start with week 1
if not current_week:
    current_week = 1
api_url = "https://api.github.com/"
get_url = "repos/{}/{}/{}"
c_params = {
    'state': 'closed',   # get closed pull-requests/issues
    'sort': 'created',
    'direction': 'asc',  # sort in ascending order by creation time
}
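# NOTE: c_params collects the query used for closed pull-requests/issues,
# but the github_get() calls below pass their parameters explicitly, so
# this dict is currently unused.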
pr = []
pr_times = []
gi = []
comments = []
c_times = []
authors = {}
issue_comment_re = re.compile(r'.*/(pull|issues)/([0-9]+)#.*')
link_rel = re.compile(r'<([^>]+)>; rel="([\w]+)"(?:, )?')
request_count = 0
def github_get(get_code, **params):
    global request_count
    res = []
    url = api_url + get_url.format(org, repo, get_code)
    while url:
        r = requests.get(url, auth=auth, params=params)
        if r.status_code != 200:
            print('Status code {} received'.format(r.status_code))
            print(r.content)
            sys.exit(1)
        # follow pagination: parse the Link header and keep the rel="next" URL
        links = dict((m[1], m[0]) for m in link_rel.findall(r.headers.get('link', '')))
        url = links.get('next')
        res.extend(r.json())
        request_count = r.headers['X-RateLimit-Remaining']
    return res
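# A GitHub 'Link' header looks roughly like this (illustrative values):
#   <https://api.github.com/repositories/123/issues?page=2>; rel="next",
#   <https://api.github.com/repositories/123/issues?page=7>; rel="last"
# link_rel captures the (url, rel) pairs, and the loop above keeps
# following rel="next" until the last page has been fetched.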
label_values = {
    'bug': 2,
    'enhancement': 2,
    'challenge': 2,
    'help wanted': 1,
    'invalid': -1,
    'duplicate': -2
}
def label_value(labels):
    # check predefined values
    values = [label_values[x] for x in labels if x in label_values]
    # each black star (★) in a label name adds one point
    values += [x.count('★') for x in labels]
    if values:
        if min(values) < 0:
            return min(values)
        value = max(values)
    else:
        value = 0
    # add all white stars
    value += sum(x.count('☆') for x in labels)
    return value
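# Worked example with made-up labels ['challenge', '★★', '☆ bonus']:
# 'challenge' has a predefined value of 2, the two black stars also give 2,
# there are no negatives, so value = max(2, 2) = 2; the single white star
# then adds 1, giving a final value of 3.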
def issue_value(value, issue_id=None):
    if issue_id:
        old_value = saved_issues.get(issue_id)
        if old_value and old_value[1] != value:
            print("Value change detected in issue {}".format(issue_id))
    return value

def pull_value(label, pull_id=None, ref=0):
    if label:
        value = label_value(label)
    else:
        value = None
        print("No label for pull-request {}".format(ref))
    if pull_id:
        old_value = saved_pulls.get(pull_id)
        if old_value and old_value[1] is not None and old_value[1] != value:
            print("Value change detected in pull-request {}".format(ref))
            if value is None:
                value = old_value[1]
    return value
ptotal = 0
itotal = 0
ctotal = 0
# pass through the repos
for org, repo in repos:
    print('Processing repo {}:'.format(repo), end=' ', flush=True)
    # get all the issues; do this first, as the result also includes the
    # pull-requests, whose labels can only be determined here
    issues = github_get('issues', state='all', sort='created', direction='asc')
    icount = 0
    for i in issues:
        ref = int(i['number'])
        author = i['user']['login']
        authors[(repo, ref)] = author
        label = [x['name'] for x in i['labels']]
        issue_labels[(repo, ref)] = label
        if 'pull_request' not in i and author in students:
            gi.append(
                {
                    'id': i['id'],
                    'repo': repo,
                    'ref': ref,
                    'title': i['title'],
                    'url': i['html_url'],
                    'author': author,
                    'label': label,
                    'week': saved_issues.get(i['id'], [current_week])[0],
                    'value': issue_value(label_value(label), i['id'])
                })
            icount += 1
    # get the merged pull-requests
    pulls = github_get('pulls', state='closed')
    pcount = 0
    for p in pulls:
        ref = int(p['number'])
        author = p['user']['login']
        label = issue_labels.get((repo, ref), [])
        if p['merged_at'] and author in students and (not label or label_value(label) >= 0):
            i = bisect.bisect_right(pr_times, p['merged_at'])
            pr_times.insert(i, p['merged_at'])
            # check authors of included commits
            if p['id'] in saved_pulls:
                author = saved_pulls[p['id']][2]
            else:
                pcommits = github_get('pulls/{}/commits'.format(ref))
                pc_authors = [author]
                for a in pcommits:
                    al = a['author'].get('login') if a['author'] else None
                    cl = a['committer'].get('login') if a['committer'] else None
                    if al == cl or cl == 'web-flow' or cl == 'kcs':
                        aa = al
                    else:
                        aa = ':'.join(x for x in [al, cl] if x)
                    if aa and aa not in pc_authors and aa != 'kcs':
                        pc_authors.append(aa)
                if len(pc_authors) != 1:
                    author = pc_authors
            pr.insert(i, {
                'id': p['id'],
                'repo': repo,
                'ref': ref,
                'title': p['title'],
                'url': p['html_url'],
                'label': label,
                'author': author,
                'week': saved_pulls.get(p['id'], [current_week])[0],
                'value': pull_value(label, p['id'], ref)
            })
            if len(saved_pulls.get(p['id'], [])) > 3:
                pr[i]['multi'] = saved_pulls[p['id']][3]
            pcount += 1
    # and now for the comments:
    # this is more troublesome, as constructive comments must be selected
    # manually, so we keep a persistent JSON file for the comments holding
    # a validity value; newly downloaded comments have this value unset
    # and they can be validated afterwards manually (or by script)
    ccount = 0
    # 3 types of comments exist on GitHub: issue comments, review comments
    # and commit comments; all have to be handled separately
    ccomments = github_get('comments', per_page=50, sort='created')
    #with open('all_ccomments.json', 'w') as f:
    #    json.dump(ccomments, f, indent=4, sort_keys=True)
    for c in ccomments:
        author = c['user']['login']
        if author in students:
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'commit': c['commit_id'],
                    'msg': c['body'],
                    'url': c['html_url'],
                    'author': author,
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            ccount += 1
    icomments = github_get('issues/comments', per_page=50, sort='created')
    for c in icomments:
        author = c['user']['login']
        if author in students:
            url = c['html_url']
            m = issue_comment_re.fullmatch(url)
            if not m:
                print("Problem parsing issue url " + url)
                sys.exit(1)
            ref = int(m.group(2))
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'issue': ref,
                    'msg': c['body'],
                    'url': url,
                    'author': author,
                    'issue_author': authors[(repo, ref)],
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            if m.group(1) == 'issues' and (repo, ref) in issue_labels:
                comments[i]['label'] = issue_labels[(repo, ref)]
            ccount += 1
    pcomments = github_get('pulls/comments', per_page=50, sort='created')
    for c in pcomments:
        author = c['user']['login']
        if author in students:
            ref = int(c['pull_request_url'].rsplit('/', 1)[1])
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'pull': ref,
                    'msg': c['body'],
                    'url': c['html_url'],
                    'author': author,
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            ccount += 1
    print('found {} merged pull-requests, {} issues and {} comments'.format(pcount, icount, ccount))
    ptotal += pcount
    itotal += icount
    ctotal += ccount
with open(path + 'data/pulls.json', 'w') as f:
    json.dump(pr, f, indent=4, sort_keys=True, ensure_ascii=False)
with open(path + 'data/issues.json', 'w') as f:
    json.dump(gi, f, indent=4, sort_keys=True, ensure_ascii=False)
with open(path + 'data/comments.json', 'w') as f:
    json.dump(comments, f, indent=4, sort_keys=True, ensure_ascii=False)
print("Total: {} merged pull-requests, {} issues and {} comments [{} contributions]".format(
    ptotal, itotal, ctotal, ptotal + itotal + ctotal))
print("Remaining request count: {}".format(request_count))
repo_key = dict((r[1],i) for i,r in enumerate(repos))
def sort_repos(x):
    '''Sort the repos in a blacklist with (repo, ref) structure in the order
    in which they appear in the repos list.
    If a repo is not in the list, it is placed afterwards.
    '''
    if x[0] in repo_key:
        return (repo_key[x[0]], x[1])
    else:
        return (len(repos),) + x
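# Hypothetical usage: order blacklist entries (read from blacklist.lst as
# (repo, ref) tuples) in the same order as the repos list:
#   blacklist.sort(key=sort_repos)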