root/hodgestar/PythonCode/SvnDots/svndots.py

Revision 673, 13.8 kB (checked in by hodgestar, 3 years ago)

Add custom path ordering optimization (reduces distance between consecutive commits by the same user).

Line 
1#!/usr/bin/env python
2
3import re
4import optparse
5import datetime
6import math
7import random
8import os
9import logging
10
11class Rev(object):
12    """Revision object."""
13
14    def __init__(self, revid, date):
15        self.revid = revid
16        self.date = date
17        self.contributions = {}
18        self.comments = {}
19
20    def __str__(self):
21        return "r%s: %s" % (self.revid, self.date)
22
23    def add_contribution(self, user, path):
24        if user in self.contributions:
25            self.contributions[user].add(path)
26        else:
27            self.contributions[user] = set([path])
28
29    def add_comment(self, user, comment):
30        if user in self.comments:
31            self.comments[user].add(comment)
32        else:
33            self.comments[user] = set([comment])
34
35
36class SvnLogParser(object):
37
38    START_RE = re.compile(r"^----+$")
39
40    DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
41
42    def _start_state(self, line, state):
43        """Process start line, return new state."""
44        if self.START_RE.match(line):
45            return self._first_line_state
46        return self._start_state
47
48    def _first_line_state(self, line, state):
49        """Process first line, move to paths."""
50        revid, user, date, lines = [part.strip() for part in line.split('|')]
51        revid = int(revid[1:])
52        date = datetime.datetime.strptime(" ".join(date.split()[:2]), self.DATE_FORMAT)
53        lines = int(lines.split()[0])
54
55        state['rev'] = Rev(revid, date)
56        state['user'] = user
57        state['lines_remaining'] = lines
58
59        return self._path_state
60
61    def _path_state(self, line, state):
62        if line == "Changed paths:":
63            return self._path_state
64        elif not line.strip():
65            return self._comment_state
66        else:
67            change, filename = line.split()[:2]
68            state['rev'].add_contribution(state['user'], filename)
69            return self._path_state
70
71    def _comment_state(self, line, state):
72        state.setdefault('comment', '')
73        state['comment'] += line
74        state['lines_remaining'] -= 1
75        if state['lines_remaining'] > 0:
76            return self._comment_state
77        else:
78            state['rev'].add_comment(state['user'], state['comment'])
79            state['done'] = True
80            return self._start_state
81
82    def parse(self, logfile):
83        """Parse svn log -v file."""
84        # Example log entry:
85        #
86        # ------------------------------------------------------------------------
87        # r371 | jerith | 2009-09-12 18:01:06 +0200 (Sat, 12 Sep 2009) | 1 line
88        # Changed paths:
89        #    M /gamelib/animal.py
90        #
91        # Foxes are more averse to armed chickens and less averse to indoor chickens.
92
93        state = {}
94        state_func = self._start_state
95
96        for line in logfile:
97            state_func = state_func(line, state)
98            if 'done' in state:
99                yield state['rev']
100                state.clear()
101
102
103class Vector(object):
104    """Really simple class representing a row of card table data."""
105
106    METRICS = {}
107
108    def __init__(self, aData):
109        self._aData = aData
110
111    # We do want to access _aData on other Vectors.
112    # pylint: disable-msg=W0212
113
114    def euclidian_distance(self, oVec2):
115        """Euclidean distance between two vectors."""
116        assert len(self._aData) == len(oVec2._aData)
117        fSum = sum(((x-y)**2 for (x, y) in zip(self._aData, oVec2._aData)))
118        return math.sqrt(fSum)
119
120    METRICS['Euclidean Distance'] = euclidian_distance
121
122    def sutekh_distance(self, oVec2):
123        """Like Euclidean distance, but -1 is distance 0.25 to anything
124           (including 0 and -1) and 0 is distance 4.0 from anything other
125           than 0.
126           """
127        assert len(self._aData) == len(oVec2._aData)
128        fSum = sum((
129            ((x == -1 or y == -1) and 0.25) or
130            (((x == 0) ^ (y == 0) and 4.0)) or
131            (x-y)**2 for (x, y) in zip(self._aData, oVec2._aData)
132        ))
133        return math.sqrt(fSum)
134
135    METRICS['Sutekh Distance'] = sutekh_distance
136
137    # Other possibile metrics:
138    #  'City Block Distance',
139    #  'Correlation',
140    #  'Absolute Value of the Correlation',
141    #  'Uncentered Correlation',
142    #  'Absolute Value of the Uncentered Correlation',
143    #  "Spearman's Rank Correlation"
144    #  "Kendall's Tau"
145
146    def __add__(self, oOther):
147        """Sum this vector with another."""
148        if oOther == 0:
149            return self
150        assert len(self._aData) == len(oOther._aData)
151        return Vector([ x+y for (x, y) in zip(self._aData, oOther._aData)])
152
153    def __radd__(self, oOther):
154        """Sum this vector with another."""
155        if oOther == 0:
156            return self
157        assert len(self._aData) == len(oOther._aData)
158        return Vector([ x+y for (x, y) in zip(self._aData, oOther._aData)])
159
160    def __mul__(self, fScale):
161        """Multiply this vector by a scalar."""
162        fScale = float(fScale)
163        return Vector([x*fScale for x in self._aData])
164
165    def __rmul__(self, fScale):
166        """Multiply this vector by a scalar."""
167        fScale = float(fScale)
168        return Vector([x*fScale for x in self._aData])
169
170    def __len__(self):
171        """Length of this vector."""
172        return len(self._aData)
173
174    def __str__(self):
175        """String representation of this vector."""
176        return str(self._aData)
177
178    def __iter__(self):
179        """Iterator."""
180        return iter(self._aData)
181
182
183def k_means_plus_plus(aCards, iNumClust, fDist):
184    """Find a set of initial centers using the k-means++ algorithm.
185
186       See http://www.stanford.edu/~darthur/kMeansPlusPlus.pdf.
187       """
188    aMeans = [ random.choice(aCards) ]
189
190    while len(aMeans) < iNumClust:
191        aDists = []
192        for oVec in aCards:
193            fMinD = min((fDist(oVec, oMean) for oMean in aMeans))**2
194            aDists.append(fMinD)
195
196        fSumSq = sum(aDists)
197        fPick = random.uniform(0, fSumSq)
198
199        for iCard, fMinD in enumerate(aDists):
200            fPick -= fMinD
201            if fPick <= 0:
202                break
203
204        # iCard is defined because k_means doesn't call
205        # this unless aCards is non-empty
206        # pylint: disable-msg=W0631
207
208        if iCard == len(aCards):
209            # guard against slight possibility of being very close
210            # to end of fSumSq.
211            iCard -= 1
212
213        aMeans.append(aCards[iCard])
214
215    return aMeans
216
217def k_means(aCards, iNumClust, iIterations, fDist):
218    """Perform k-means clustering on a list of cards using Lloyd's
219       algorithm."""
220    if (not aCards) or (not aCards[0]):
221        # empty card set or zero-length vectors
222        return [], []
223
224    aMeans = k_means_plus_plus(aCards, iNumClust, fDist)
225    iCards = len(aCards)
226
227    # just do a fixed number of interations (no complex stopping condition)
228    for _iIter in range(iIterations):
229        # empty clusters
230        aClusters = []
231        for iClust in xrange(iNumClust):
232            aClusters.append([])
233
234        # calculate membership in clusters
235        for iCard in xrange(iCards):
236            oVec = aCards[iCard]
237            iVmin = min(xrange(iNumClust),
238                key=lambda iV: fDist(oVec, aMeans[iV]))
239            aClusters[iVmin].append(iCard)
240
241        # recompute the centroids
242        for iClust in xrange(iNumClust):
243            if aClusters[iClust]:
244                aMeans[iClust] = (1.0 / len(aClusters[iClust])) * sum(
245                    (aCards[x] for x in aClusters[iClust])
246                )
247
248    return aMeans, aClusters
249
250
251def plotrevs(users, paths, data, is_date=False, linestyle="o-"):
252    """Plot a list of revision objects"""
253    import matplotlib.pyplot as plt
254    import pylab
255
256    common = len(os.path.commonprefix(paths))
257    paths = [p[common:] for p in paths]
258    for user, points in data.items():
259        data[user] = [(d[0], d[1][common:]) for d in points]
260
261    pathdict = dict(zip(paths, range(len(paths))))
262    userdict = dict(zip(users, range(len(users))))
263    usercolor = dict(zip(users, ['r', 'g', 'b', 'c', 'm', 'y']))
264
265    if is_date:
266        plotter = plt.plot_date
267        xgetter = lambda r: r.date
268    else:
269        plotter = plt.plot
270        xgetter = lambda r: r.revid
271
272    for user in users:
273        points = data[user]
274        x = [xgetter(p[0]) for p in points]
275        y = [pathdict[p[1]]+0.1*userdict[user]-0.2 for p in points]
276        plotter(x, y,
277            usercolor[user] + '-',
278            alpha=0.25,
279            linewidth=1.0,
280        )
281        plotter(x, y,
282            usercolor[user] + linestyle,
283            label=user,
284            markersize=8.0,
285            linewidth=1.0,
286            picker=4.0,
287        )
288
289    plt.yticks([y for y in range(len(paths))], paths)
290    plt.grid(color='grey', linestyle='-', alpha=0.25, ydata=[y for y in range(0,len(paths)+1,2)], linewidth=24.0)
291    plt.ylabel('File')
292    plt.xlabel(is_date and 'Date' or 'Revision')
293    plt.ylim(-0.5,len(paths)-0.5)
294    plt.legend(loc=0)
295
296    fig = plt.figure(1)
297    xmin, xmax = plt.xlim()
298    ymin, ymax = plt.ylim()
299    ann = plt.annotate("default", (xmin + (xmax-xmin)*0.05, ymin + (ymax-ymin)*0.05),
300        backgroundcolor="yellow",
301        bbox=dict(boxstyle="round", fc="0.8"),
302        visible=False,
303    )
304    #ann.set_animated(True)
305    current_rev = [-1]
306
307    def onpick(event):
308        line = event.artist
309        ind = event.ind[0]
310        x, y = line.get_xdata()[ind], line.get_ydata()[ind]
311        user = line.get_label()
312        rev = data[user][ind][0]
313        comment = list(rev.comments[user])[0]
314        text = "%s [%s]: %s" % (user, rev.revid, comment)
315        text = text[:150]
316
317        if ann.get_visible() and current_rev[0] == rev.revid:
318            # print "II:", rev.revid
319            ann.set_visible(False)
320            current_rev[0] = -1
321        else:
322            # print "VV:", rev.revid
323            ann.set_text(text)
324            ann.set_visible(True)
325            current_rev[0] = rev.revid
326
327        pylab.draw()
328
329    cid = fig.canvas.mpl_connect('pick_event', onpick)
330
331    plt.show()
332
333def minimize_rev_separation(revs, paths):
334    """Attempt to minimize the maximum distance between paths in
335       a revision.
336       """
337    import numpy
338
339    pathindex = dict(zip(paths, range(len(paths))))
340    revindex = []
341    userindex = {}
342    for rev in revs:
343        for r_user, r_paths in rev.contributions.items():
344            idxs = [pathindex[r] for r in r_paths if r in pathindex]
345            if idxs:
346                revindex.append(idxs)
347                userindex.setdefault(r_user,[])
348                userindex[r_user].extend(idxs)
349    userindex = userindex.values()
350
351    allindex = userindex
352
353    def func(x):
354        tot = 0.0
355        for idxs in allindex:
356            places = x.take(idxs)
357            tot += sum((places[1:] - places[:-1])**2)
358        return tot
359
360    x0 = numpy.array(range(len(paths)), dtype=float)
361    best_tot = func(x0)
362    path_min = x0.copy()
363    logging.info("Initial objective function value: %s", best_tot)
364    for i in range(1000):
365        random.shuffle(x0)
366        tot = func(x0)
367        if tot < best_tot:
368            best_tot = tot
369            path_min = x0.copy()
370    logging.info("Final objective function value:", best_tot)
371
372    path_pairs = zip(paths, path_min)
373    path_pairs.sort(key=lambda p: p[1])
374    return [p[0] for p in path_pairs]
375
376def main(args):
377
378    parser = optparse.OptionParser(version="%prog 0.1")
379    parser.add_option("-d", "--bydate", dest="bydate",
380                      action="store_true",
381                      help="plot by date rather than revision id.")
382    parser.add_option("-l", "--linestyle", dest="linestyle",
383                      default="o-",
384                      help="matplotlib line format")
385    parser.add_option("-s", "--seed", dest="seed",
386                      type="int",
387                      default=42,
388                      help="set random module seed.")
389    parser.add_option("-f", "--filter", dest="filter",
390                      default=r".*\.py$",
391                      help="regular expression filter for file names")
392
393    (options, args) = parser.parse_args()
394
395    random.seed(options.seed)
396
397    if len(args) < 1:
398        print parser.get_usage()
399        return 1
400
401    logfile = args[0]
402
403    parser = SvnLogParser()
404
405    revs = list(parser.parse(file(logfile)))
406    revs.sort(key=lambda rev: rev.date)
407
408    users, paths = set(), set()
409    data = {}
410    path_vecs = {}
411
412    path_revs = {}
413
414    path_re = re.compile(options.filter)
415
416    for i, rev in enumerate(revs):
417        for r_user, r_paths in rev.contributions.iteritems():
418            r_paths = [p for p in r_paths if path_re.match(p)]
419            users.add(r_user)
420            paths.update(r_paths)
421
422            data.setdefault(r_user, [])
423            data[r_user].extend((rev, p) for p in r_paths)
424
425            for r_path in r_paths:
426                path_revs.setdefault(r_path, [])
427                path_revs[r_path].append(i)
428
429#            for r_path in r_paths:
430#                path_vecs.setdefault(r_path, {})
431#                path_vecs[r_path].setdefault(r_user, 0.0)
432#                path_vecs[r_path][r_user] += 1.0
433
434    users = sorted(users)
435
436    # paths = sorted(paths, key=lambda p: (len(path_revs[p]), max(path_revs[p])))
437    # ordered_paths = paths
438
439    ordered_paths = minimize_rev_separation(revs, paths)
440
441#    path_vecs = path_vecs.items()
442#    path_names = [pv[0] for pv in path_vecs]
443#    path_dicts = [pv[1] for pv in path_vecs]
444#    path_vectors = [Vector([u in d and d[u] or 0.0 for u in users]) for d in path_dicts]
445#    path_vectors = [v*(1.0/sum(v)) for v in path_vectors]
446#
447#    means, clusters = k_means(path_vectors, len(users)+2, 100, Vector.euclidian_distance)
448#
449#    clusters.sort(key=lambda c: len(c))
450#
451#    ordered_paths = []
452#    print "Clusters:"
453#    for cluster in clusters:
454#        print "\t", [path_names[i] for i in cluster]
455#        ordered_paths.extend(path_names[i] for i in cluster)
456
457    plotrevs(users, ordered_paths, data, options.bydate, options.linestyle)
458
459
460if __name__ == "__main__":
461    import sys
462    sys.exit(main(sys.argv))
463
Note: See TracBrowser for help on using the browser.