| 1 | #!/usr/bin/env python |
|---|
| 2 | |
|---|
| 3 | import re |
|---|
| 4 | import optparse |
|---|
| 5 | import datetime |
|---|
| 6 | import math |
|---|
| 7 | import random |
|---|
| 8 | import os |
|---|
| 9 | import logging |
|---|
| 10 | |
|---|
| 11 | class Rev(object): |
|---|
| 12 | """Revision object.""" |
|---|
| 13 | |
|---|
| 14 | def __init__(self, revid, date): |
|---|
| 15 | self.revid = revid |
|---|
| 16 | self.date = date |
|---|
| 17 | self.contributions = {} |
|---|
| 18 | self.comments = {} |
|---|
| 19 | |
|---|
| 20 | def __str__(self): |
|---|
| 21 | return "r%s: %s" % (self.revid, self.date) |
|---|
| 22 | |
|---|
| 23 | def add_contribution(self, user, path): |
|---|
| 24 | if user in self.contributions: |
|---|
| 25 | self.contributions[user].add(path) |
|---|
| 26 | else: |
|---|
| 27 | self.contributions[user] = set([path]) |
|---|
| 28 | |
|---|
| 29 | def add_comment(self, user, comment): |
|---|
| 30 | if user in self.comments: |
|---|
| 31 | self.comments[user].add(comment) |
|---|
| 32 | else: |
|---|
| 33 | self.comments[user] = set([comment]) |
|---|
| 34 | |
|---|
| 35 | |
|---|
| 36 | class SvnLogParser(object): |
|---|
| 37 | |
|---|
| 38 | START_RE = re.compile(r"^----+$") |
|---|
| 39 | |
|---|
| 40 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S' |
|---|
| 41 | |
|---|
| 42 | def _start_state(self, line, state): |
|---|
| 43 | """Process start line, return new state.""" |
|---|
| 44 | if self.START_RE.match(line): |
|---|
| 45 | return self._first_line_state |
|---|
| 46 | return self._start_state |
|---|
| 47 | |
|---|
| 48 | def _first_line_state(self, line, state): |
|---|
| 49 | """Process first line, move to paths.""" |
|---|
| 50 | revid, user, date, lines = [part.strip() for part in line.split('|')] |
|---|
| 51 | revid = int(revid[1:]) |
|---|
| 52 | date = datetime.datetime.strptime(" ".join(date.split()[:2]), self.DATE_FORMAT) |
|---|
| 53 | lines = int(lines.split()[0]) |
|---|
| 54 | |
|---|
| 55 | state['rev'] = Rev(revid, date) |
|---|
| 56 | state['user'] = user |
|---|
| 57 | state['lines_remaining'] = lines |
|---|
| 58 | |
|---|
| 59 | return self._path_state |
|---|
| 60 | |
|---|
| 61 | def _path_state(self, line, state): |
|---|
| 62 | if line == "Changed paths:": |
|---|
| 63 | return self._path_state |
|---|
| 64 | elif not line.strip(): |
|---|
| 65 | return self._comment_state |
|---|
| 66 | else: |
|---|
| 67 | change, filename = line.split()[:2] |
|---|
| 68 | state['rev'].add_contribution(state['user'], filename) |
|---|
| 69 | return self._path_state |
|---|
| 70 | |
|---|
| 71 | def _comment_state(self, line, state): |
|---|
| 72 | state.setdefault('comment', '') |
|---|
| 73 | state['comment'] += line |
|---|
| 74 | state['lines_remaining'] -= 1 |
|---|
| 75 | if state['lines_remaining'] > 0: |
|---|
| 76 | return self._comment_state |
|---|
| 77 | else: |
|---|
| 78 | state['rev'].add_comment(state['user'], state['comment']) |
|---|
| 79 | state['done'] = True |
|---|
| 80 | return self._start_state |
|---|
| 81 | |
|---|
| 82 | def parse(self, logfile): |
|---|
| 83 | """Parse svn log -v file.""" |
|---|
| 84 | # Example log entry: |
|---|
| 85 | # |
|---|
| 86 | # ------------------------------------------------------------------------ |
|---|
| 87 | # r371 | jerith | 2009-09-12 18:01:06 +0200 (Sat, 12 Sep 2009) | 1 line |
|---|
| 88 | # Changed paths: |
|---|
| 89 | # M /gamelib/animal.py |
|---|
| 90 | # |
|---|
| 91 | # Foxes are more averse to armed chickens and less averse to indoor chickens. |
|---|
| 92 | |
|---|
| 93 | state = {} |
|---|
| 94 | state_func = self._start_state |
|---|
| 95 | |
|---|
| 96 | for line in logfile: |
|---|
| 97 | state_func = state_func(line, state) |
|---|
| 98 | if 'done' in state: |
|---|
| 99 | yield state['rev'] |
|---|
| 100 | state.clear() |
|---|
| 101 | |
|---|
| 102 | |
|---|
| 103 | class Vector(object): |
|---|
| 104 | """Really simple class representing a row of card table data.""" |
|---|
| 105 | |
|---|
| 106 | METRICS = {} |
|---|
| 107 | |
|---|
| 108 | def __init__(self, aData): |
|---|
| 109 | self._aData = aData |
|---|
| 110 | |
|---|
| 111 | # We do want to access _aData on other Vectors. |
|---|
| 112 | # pylint: disable-msg=W0212 |
|---|
| 113 | |
|---|
| 114 | def euclidian_distance(self, oVec2): |
|---|
| 115 | """Euclidean distance between two vectors.""" |
|---|
| 116 | assert len(self._aData) == len(oVec2._aData) |
|---|
| 117 | fSum = sum(((x-y)**2 for (x, y) in zip(self._aData, oVec2._aData))) |
|---|
| 118 | return math.sqrt(fSum) |
|---|
| 119 | |
|---|
| 120 | METRICS['Euclidean Distance'] = euclidian_distance |
|---|
| 121 | |
|---|
| 122 | def sutekh_distance(self, oVec2): |
|---|
| 123 | """Like Euclidean distance, but -1 is distance 0.25 to anything |
|---|
| 124 | (including 0 and -1) and 0 is distance 4.0 from anything other |
|---|
| 125 | than 0. |
|---|
| 126 | """ |
|---|
| 127 | assert len(self._aData) == len(oVec2._aData) |
|---|
| 128 | fSum = sum(( |
|---|
| 129 | ((x == -1 or y == -1) and 0.25) or |
|---|
| 130 | (((x == 0) ^ (y == 0) and 4.0)) or |
|---|
| 131 | (x-y)**2 for (x, y) in zip(self._aData, oVec2._aData) |
|---|
| 132 | )) |
|---|
| 133 | return math.sqrt(fSum) |
|---|
| 134 | |
|---|
| 135 | METRICS['Sutekh Distance'] = sutekh_distance |
|---|
| 136 | |
|---|
| 137 | # Other possibile metrics: |
|---|
| 138 | # 'City Block Distance', |
|---|
| 139 | # 'Correlation', |
|---|
| 140 | # 'Absolute Value of the Correlation', |
|---|
| 141 | # 'Uncentered Correlation', |
|---|
| 142 | # 'Absolute Value of the Uncentered Correlation', |
|---|
| 143 | # "Spearman's Rank Correlation" |
|---|
| 144 | # "Kendall's Tau" |
|---|
| 145 | |
|---|
| 146 | def __add__(self, oOther): |
|---|
| 147 | """Sum this vector with another.""" |
|---|
| 148 | if oOther == 0: |
|---|
| 149 | return self |
|---|
| 150 | assert len(self._aData) == len(oOther._aData) |
|---|
| 151 | return Vector([ x+y for (x, y) in zip(self._aData, oOther._aData)]) |
|---|
| 152 | |
|---|
| 153 | def __radd__(self, oOther): |
|---|
| 154 | """Sum this vector with another.""" |
|---|
| 155 | if oOther == 0: |
|---|
| 156 | return self |
|---|
| 157 | assert len(self._aData) == len(oOther._aData) |
|---|
| 158 | return Vector([ x+y for (x, y) in zip(self._aData, oOther._aData)]) |
|---|
| 159 | |
|---|
| 160 | def __mul__(self, fScale): |
|---|
| 161 | """Multiply this vector by a scalar.""" |
|---|
| 162 | fScale = float(fScale) |
|---|
| 163 | return Vector([x*fScale for x in self._aData]) |
|---|
| 164 | |
|---|
| 165 | def __rmul__(self, fScale): |
|---|
| 166 | """Multiply this vector by a scalar.""" |
|---|
| 167 | fScale = float(fScale) |
|---|
| 168 | return Vector([x*fScale for x in self._aData]) |
|---|
| 169 | |
|---|
| 170 | def __len__(self): |
|---|
| 171 | """Length of this vector.""" |
|---|
| 172 | return len(self._aData) |
|---|
| 173 | |
|---|
| 174 | def __str__(self): |
|---|
| 175 | """String representation of this vector.""" |
|---|
| 176 | return str(self._aData) |
|---|
| 177 | |
|---|
| 178 | def __iter__(self): |
|---|
| 179 | """Iterator.""" |
|---|
| 180 | return iter(self._aData) |
|---|
| 181 | |
|---|
| 182 | |
|---|
| 183 | def k_means_plus_plus(aCards, iNumClust, fDist): |
|---|
| 184 | """Find a set of initial centers using the k-means++ algorithm. |
|---|
| 185 | |
|---|
| 186 | See http://www.stanford.edu/~darthur/kMeansPlusPlus.pdf. |
|---|
| 187 | """ |
|---|
| 188 | aMeans = [ random.choice(aCards) ] |
|---|
| 189 | |
|---|
| 190 | while len(aMeans) < iNumClust: |
|---|
| 191 | aDists = [] |
|---|
| 192 | for oVec in aCards: |
|---|
| 193 | fMinD = min((fDist(oVec, oMean) for oMean in aMeans))**2 |
|---|
| 194 | aDists.append(fMinD) |
|---|
| 195 | |
|---|
| 196 | fSumSq = sum(aDists) |
|---|
| 197 | fPick = random.uniform(0, fSumSq) |
|---|
| 198 | |
|---|
| 199 | for iCard, fMinD in enumerate(aDists): |
|---|
| 200 | fPick -= fMinD |
|---|
| 201 | if fPick <= 0: |
|---|
| 202 | break |
|---|
| 203 | |
|---|
| 204 | # iCard is defined because k_means doesn't call |
|---|
| 205 | # this unless aCards is non-empty |
|---|
| 206 | # pylint: disable-msg=W0631 |
|---|
| 207 | |
|---|
| 208 | if iCard == len(aCards): |
|---|
| 209 | # guard against slight possibility of being very close |
|---|
| 210 | # to end of fSumSq. |
|---|
| 211 | iCard -= 1 |
|---|
| 212 | |
|---|
| 213 | aMeans.append(aCards[iCard]) |
|---|
| 214 | |
|---|
| 215 | return aMeans |
|---|
| 216 | |
|---|
| 217 | def k_means(aCards, iNumClust, iIterations, fDist): |
|---|
| 218 | """Perform k-means clustering on a list of cards using Lloyd's |
|---|
| 219 | algorithm.""" |
|---|
| 220 | if (not aCards) or (not aCards[0]): |
|---|
| 221 | # empty card set or zero-length vectors |
|---|
| 222 | return [], [] |
|---|
| 223 | |
|---|
| 224 | aMeans = k_means_plus_plus(aCards, iNumClust, fDist) |
|---|
| 225 | iCards = len(aCards) |
|---|
| 226 | |
|---|
| 227 | # just do a fixed number of interations (no complex stopping condition) |
|---|
| 228 | for _iIter in range(iIterations): |
|---|
| 229 | # empty clusters |
|---|
| 230 | aClusters = [] |
|---|
| 231 | for iClust in xrange(iNumClust): |
|---|
| 232 | aClusters.append([]) |
|---|
| 233 | |
|---|
| 234 | # calculate membership in clusters |
|---|
| 235 | for iCard in xrange(iCards): |
|---|
| 236 | oVec = aCards[iCard] |
|---|
| 237 | iVmin = min(xrange(iNumClust), |
|---|
| 238 | key=lambda iV: fDist(oVec, aMeans[iV])) |
|---|
| 239 | aClusters[iVmin].append(iCard) |
|---|
| 240 | |
|---|
| 241 | # recompute the centroids |
|---|
| 242 | for iClust in xrange(iNumClust): |
|---|
| 243 | if aClusters[iClust]: |
|---|
| 244 | aMeans[iClust] = (1.0 / len(aClusters[iClust])) * sum( |
|---|
| 245 | (aCards[x] for x in aClusters[iClust]) |
|---|
| 246 | ) |
|---|
| 247 | |
|---|
| 248 | return aMeans, aClusters |
|---|
| 249 | |
|---|
| 250 | |
|---|
| 251 | def plotrevs(users, paths, data, is_date=False, linestyle="o-"): |
|---|
| 252 | """Plot a list of revision objects""" |
|---|
| 253 | import matplotlib.pyplot as plt |
|---|
| 254 | import pylab |
|---|
| 255 | |
|---|
| 256 | common = len(os.path.commonprefix(paths)) |
|---|
| 257 | paths = [p[common:] for p in paths] |
|---|
| 258 | for user, points in data.items(): |
|---|
| 259 | data[user] = [(d[0], d[1][common:]) for d in points] |
|---|
| 260 | |
|---|
| 261 | pathdict = dict(zip(paths, range(len(paths)))) |
|---|
| 262 | userdict = dict(zip(users, range(len(users)))) |
|---|
| 263 | usercolor = dict(zip(users, ['r', 'g', 'b', 'c', 'm', 'y'])) |
|---|
| 264 | |
|---|
| 265 | if is_date: |
|---|
| 266 | plotter = plt.plot_date |
|---|
| 267 | xgetter = lambda r: r.date |
|---|
| 268 | else: |
|---|
| 269 | plotter = plt.plot |
|---|
| 270 | xgetter = lambda r: r.revid |
|---|
| 271 | |
|---|
| 272 | for user in users: |
|---|
| 273 | points = data[user] |
|---|
| 274 | x = [xgetter(p[0]) for p in points] |
|---|
| 275 | y = [pathdict[p[1]]+0.1*userdict[user]-0.2 for p in points] |
|---|
| 276 | plotter(x, y, |
|---|
| 277 | usercolor[user] + '-', |
|---|
| 278 | alpha=0.25, |
|---|
| 279 | linewidth=1.0, |
|---|
| 280 | ) |
|---|
| 281 | plotter(x, y, |
|---|
| 282 | usercolor[user] + linestyle, |
|---|
| 283 | label=user, |
|---|
| 284 | markersize=8.0, |
|---|
| 285 | linewidth=1.0, |
|---|
| 286 | picker=4.0, |
|---|
| 287 | ) |
|---|
| 288 | |
|---|
| 289 | plt.yticks([y for y in range(len(paths))], paths) |
|---|
| 290 | plt.grid(color='grey', linestyle='-', alpha=0.25, ydata=[y for y in range(0,len(paths)+1,2)], linewidth=24.0) |
|---|
| 291 | plt.ylabel('File') |
|---|
| 292 | plt.xlabel(is_date and 'Date' or 'Revision') |
|---|
| 293 | plt.ylim(-0.5,len(paths)-0.5) |
|---|
| 294 | plt.legend(loc=0) |
|---|
| 295 | |
|---|
| 296 | fig = plt.figure(1) |
|---|
| 297 | xmin, xmax = plt.xlim() |
|---|
| 298 | ymin, ymax = plt.ylim() |
|---|
| 299 | ann = plt.annotate("default", (xmin + (xmax-xmin)*0.05, ymin + (ymax-ymin)*0.05), |
|---|
| 300 | backgroundcolor="yellow", |
|---|
| 301 | bbox=dict(boxstyle="round", fc="0.8"), |
|---|
| 302 | visible=False, |
|---|
| 303 | ) |
|---|
| 304 | #ann.set_animated(True) |
|---|
| 305 | current_rev = [-1] |
|---|
| 306 | |
|---|
| 307 | def onpick(event): |
|---|
| 308 | line = event.artist |
|---|
| 309 | ind = event.ind[0] |
|---|
| 310 | x, y = line.get_xdata()[ind], line.get_ydata()[ind] |
|---|
| 311 | user = line.get_label() |
|---|
| 312 | rev = data[user][ind][0] |
|---|
| 313 | comment = list(rev.comments[user])[0] |
|---|
| 314 | text = "%s [%s]: %s" % (user, rev.revid, comment) |
|---|
| 315 | text = text[:150] |
|---|
| 316 | |
|---|
| 317 | if ann.get_visible() and current_rev[0] == rev.revid: |
|---|
| 318 | # print "II:", rev.revid |
|---|
| 319 | ann.set_visible(False) |
|---|
| 320 | current_rev[0] = -1 |
|---|
| 321 | else: |
|---|
| 322 | # print "VV:", rev.revid |
|---|
| 323 | ann.set_text(text) |
|---|
| 324 | ann.set_visible(True) |
|---|
| 325 | current_rev[0] = rev.revid |
|---|
| 326 | |
|---|
| 327 | pylab.draw() |
|---|
| 328 | |
|---|
| 329 | cid = fig.canvas.mpl_connect('pick_event', onpick) |
|---|
| 330 | |
|---|
| 331 | plt.show() |
|---|
| 332 | |
|---|
| 333 | def minimize_rev_separation(revs, paths): |
|---|
| 334 | """Attempt to minimize the maximum distance between paths in |
|---|
| 335 | a revision. |
|---|
| 336 | """ |
|---|
| 337 | import numpy |
|---|
| 338 | |
|---|
| 339 | pathindex = dict(zip(paths, range(len(paths)))) |
|---|
| 340 | revindex = [] |
|---|
| 341 | userindex = {} |
|---|
| 342 | for rev in revs: |
|---|
| 343 | for r_user, r_paths in rev.contributions.items(): |
|---|
| 344 | idxs = [pathindex[r] for r in r_paths if r in pathindex] |
|---|
| 345 | if idxs: |
|---|
| 346 | revindex.append(idxs) |
|---|
| 347 | userindex.setdefault(r_user,[]) |
|---|
| 348 | userindex[r_user].extend(idxs) |
|---|
| 349 | userindex = userindex.values() |
|---|
| 350 | |
|---|
| 351 | allindex = userindex |
|---|
| 352 | |
|---|
| 353 | def func(x): |
|---|
| 354 | tot = 0.0 |
|---|
| 355 | for idxs in allindex: |
|---|
| 356 | places = x.take(idxs) |
|---|
| 357 | tot += sum((places[1:] - places[:-1])**2) |
|---|
| 358 | return tot |
|---|
| 359 | |
|---|
| 360 | x0 = numpy.array(range(len(paths)), dtype=float) |
|---|
| 361 | best_tot = func(x0) |
|---|
| 362 | path_min = x0.copy() |
|---|
| 363 | logging.info("Initial objective function value: %s", best_tot) |
|---|
| 364 | for i in range(1000): |
|---|
| 365 | random.shuffle(x0) |
|---|
| 366 | tot = func(x0) |
|---|
| 367 | if tot < best_tot: |
|---|
| 368 | best_tot = tot |
|---|
| 369 | path_min = x0.copy() |
|---|
| 370 | logging.info("Final objective function value:", best_tot) |
|---|
| 371 | |
|---|
| 372 | path_pairs = zip(paths, path_min) |
|---|
| 373 | path_pairs.sort(key=lambda p: p[1]) |
|---|
| 374 | return [p[0] for p in path_pairs] |
|---|
| 375 | |
|---|
| 376 | def main(args): |
|---|
| 377 | |
|---|
| 378 | parser = optparse.OptionParser(version="%prog 0.1") |
|---|
| 379 | parser.add_option("-d", "--bydate", dest="bydate", |
|---|
| 380 | action="store_true", |
|---|
| 381 | help="plot by date rather than revision id.") |
|---|
| 382 | parser.add_option("-l", "--linestyle", dest="linestyle", |
|---|
| 383 | default="o-", |
|---|
| 384 | help="matplotlib line format") |
|---|
| 385 | parser.add_option("-s", "--seed", dest="seed", |
|---|
| 386 | type="int", |
|---|
| 387 | default=42, |
|---|
| 388 | help="set random module seed.") |
|---|
| 389 | parser.add_option("-f", "--filter", dest="filter", |
|---|
| 390 | default=r".*\.py$", |
|---|
| 391 | help="regular expression filter for file names") |
|---|
| 392 | |
|---|
| 393 | (options, args) = parser.parse_args() |
|---|
| 394 | |
|---|
| 395 | random.seed(options.seed) |
|---|
| 396 | |
|---|
| 397 | if len(args) < 1: |
|---|
| 398 | print parser.get_usage() |
|---|
| 399 | return 1 |
|---|
| 400 | |
|---|
| 401 | logfile = args[0] |
|---|
| 402 | |
|---|
| 403 | parser = SvnLogParser() |
|---|
| 404 | |
|---|
| 405 | revs = list(parser.parse(file(logfile))) |
|---|
| 406 | revs.sort(key=lambda rev: rev.date) |
|---|
| 407 | |
|---|
| 408 | users, paths = set(), set() |
|---|
| 409 | data = {} |
|---|
| 410 | path_vecs = {} |
|---|
| 411 | |
|---|
| 412 | path_revs = {} |
|---|
| 413 | |
|---|
| 414 | path_re = re.compile(options.filter) |
|---|
| 415 | |
|---|
| 416 | for i, rev in enumerate(revs): |
|---|
| 417 | for r_user, r_paths in rev.contributions.iteritems(): |
|---|
| 418 | r_paths = [p for p in r_paths if path_re.match(p)] |
|---|
| 419 | users.add(r_user) |
|---|
| 420 | paths.update(r_paths) |
|---|
| 421 | |
|---|
| 422 | data.setdefault(r_user, []) |
|---|
| 423 | data[r_user].extend((rev, p) for p in r_paths) |
|---|
| 424 | |
|---|
| 425 | for r_path in r_paths: |
|---|
| 426 | path_revs.setdefault(r_path, []) |
|---|
| 427 | path_revs[r_path].append(i) |
|---|
| 428 | |
|---|
| 429 | # for r_path in r_paths: |
|---|
| 430 | # path_vecs.setdefault(r_path, {}) |
|---|
| 431 | # path_vecs[r_path].setdefault(r_user, 0.0) |
|---|
| 432 | # path_vecs[r_path][r_user] += 1.0 |
|---|
| 433 | |
|---|
| 434 | users = sorted(users) |
|---|
| 435 | |
|---|
| 436 | # paths = sorted(paths, key=lambda p: (len(path_revs[p]), max(path_revs[p]))) |
|---|
| 437 | # ordered_paths = paths |
|---|
| 438 | |
|---|
| 439 | ordered_paths = minimize_rev_separation(revs, paths) |
|---|
| 440 | |
|---|
| 441 | # path_vecs = path_vecs.items() |
|---|
| 442 | # path_names = [pv[0] for pv in path_vecs] |
|---|
| 443 | # path_dicts = [pv[1] for pv in path_vecs] |
|---|
| 444 | # path_vectors = [Vector([u in d and d[u] or 0.0 for u in users]) for d in path_dicts] |
|---|
| 445 | # path_vectors = [v*(1.0/sum(v)) for v in path_vectors] |
|---|
| 446 | # |
|---|
| 447 | # means, clusters = k_means(path_vectors, len(users)+2, 100, Vector.euclidian_distance) |
|---|
| 448 | # |
|---|
| 449 | # clusters.sort(key=lambda c: len(c)) |
|---|
| 450 | # |
|---|
| 451 | # ordered_paths = [] |
|---|
| 452 | # print "Clusters:" |
|---|
| 453 | # for cluster in clusters: |
|---|
| 454 | # print "\t", [path_names[i] for i in cluster] |
|---|
| 455 | # ordered_paths.extend(path_names[i] for i in cluster) |
|---|
| 456 | |
|---|
| 457 | plotrevs(users, ordered_paths, data, options.bydate, options.linestyle) |
|---|
| 458 | |
|---|
| 459 | |
|---|
| 460 | if __name__ == "__main__": |
|---|
| 461 | import sys |
|---|
| 462 | sys.exit(main(sys.argv)) |
|---|
| 463 | |
|---|