10067
Comentário:
|
10070
|
Deleções são marcadas assim. | Adições são marcadas assim. |
Linha 17: | Linha 17: |
DVD Split, by Marcelo Barros (barros@smar.com.br) | DVD Split, by Marcelo Barros (barros(at)smar.com.br) |
Receita: Dividir Arquivos Em Vários Cds Ou Dvds
Este script permite que uma série de arquivos sejam organizados em vários DVDs/CDs de forma otimizada, isto é, tentando sempre utilizar ao máximo o espaço disponível na mídia. É útil quando você não quer pensar muito na melhor organização e eu venho usado para organizar os meus arquivos de backup. Ao final, ele lista uma opção de organização. Como o algoritmo é estatístico, você pode rodar novamente para ter um resultado diferente, apesar de construído com os mesmos critérios.
Este programa usa uma rede neural do tipo Hopfield para a solução do problema. Basicamente é necessário especificar o diretório onde os arquivos estão, o tamanho da mídia e a folga máxima. Se nenhum parâmetro é especificado, é usado o diretório corrente, uma mídia de 4500MB (DVD) e uma folga de 50MB. A folga diz o quanto o algoritmo pode encarar margem de erro, já que em geral não é possível ter uma divisão sempre perfeita.
Rode com dvdsplit.py -h para informações da linha de comando. É meu primeiro programa em Python, comentário são bem vindos.
Código
1 """
2 DVD Split, by Marcelo Barros (barros(at)smar.com.br)
3 Sept/13/2005
4 $Id: dvdsplit.py,v 1.1 2005/09/13 14:54:36 barros Exp $
5 """
6 import random
7 import math
8 import os
9 import sys
10 from optparse import OptionParser
11 import psyco
12
13 def currentSize(files, selected):
14 """
15 Calculate the size of the current selection.
16
17 Parameters:
18 - files: list with file sizes
19 - selected: list with 1 or -1, indicating if the file was selected or not
20
21 Returns:
22 - the current selection size
23 """
24 sz = 0
25 for i in range(len(files)):
26 if(selected[i] > 0):
27 sz = sz + files[i]
28 return sz
29
30 def energyFun(files,selected,dvdsz):
31 """
32 Energy function used. In this case is:
33 f(x) = |current_size - dvdsize |
34
35 Parameters:
36 - files: list with file sizes
37 - selected: list with 1 or -1, indicating if the file was selected or not
38 - dvdsz: dvd size
39
40 Returns:
41 - energy function value for the current selection
42 """
43 energ = 0
44 for i in range(len(files)):
45 if selected[i] > 0:
46 energ = energ + files[i]
47 energ = abs(energ - dvdsz)
48 return energ
49
50 def dvdInitialGuess(files,selected,dvdsz):
51 """
52 Just provide an initial random guess to the algorithm
53
54 Parameters:
55 - files: list with file sizes
56 - selected: list with 1 or -1, indicating if the file was selected or not
57 - dvdsz: dvd size
58 """
59 l = len(files)
60 idx = range(l)
61 idx = random.sample(idx,l)
62
63 t = 0
64 for i in range(l):
65 j = idx[i]
66 t = t + files[j]
67 selected[j] = 1
68 if t >= dvdsz:
69 break
70
71 def dvdSplit(files,dvdsz,slack=50,itermax=3000,beta=0.005):
72 """
73 Given a list with file sizes, the dvd size and the acceptable slack
74 this routine gives a subset that fits in one dvd.
75
76 Parameters:
77 - files: list with file sizes
78 - dvdsz: dvd size (single side)
79 - slack: maximum acceptable error when calculating the selection
80 - itermax: maximum number of iteration (default=2000)
81 - beta: sigmoid parameter (default 0.005, empirically determined)
82
83 Returns:
84 - indexes of selected files: [ idx1, idx2, ..., idxn ]
85 - indexes of not selected files: [ idx1, idx2, ..., idxn ]
86 - current selection size
87 - iteration used to calculate
88 """
89 running = True
90 iter = 0
91 numfiles = len(files)
92 selected = [-1 for x in range(numfiles)]
93 dvdInitialGuess(files,selected,dvdsz)
94
95 sz = currentSize(files,selected) # maximum size must be greather than dvdsize
96
97 if(sz > dvdsz):
98 while(running and iter < itermax):
99 # current energy
100 e1 = energyFun(files,selected,dvdsz)
101 # change state for random node
102 n = random.randint(0,numfiles-1)
103 selected[n] = -selected[n]
104 # new energy
105 e2 = energyFun(files,selected,dvdsz)
106 # evaluate the change against sigmoid and accept it or not
107 # according to a uniform probability
108 sig = (1/( 1 + math.exp(beta*(e2-e1))))
109 if sig < random.uniform(0,1):
110 selected[n] = -selected[n] # state not accepted
111 sz = currentSize(files,selected)
112 # testing exit condition: (dvdsz - slack) < currentSize < dvdsz
113 if ( (sz >= (dvdsz - slack)) and (sz < dvdsz) ):
114 running = False
115 iter = iter + 1
116
117 sel = []
118 notsel = []
119 for i in range(numfiles):
120 if selected[i] > 0:
121 sel.append(i)
122 else:
123 notsel.append(i)
124 #print "Solution found in %d iterations. Compilation size is %d (%d files)" % (iter,sz,len(sel))
125 return sel,notsel,sz,iter
126
127 def dvdsSplit(files,dvdsz,slack=50,itermax=3000,beta=0.005):
128 """
129 Given a list with file sizes, the dvd size and the acceptable slack
130 this routine distributes the selection over several dvs.
131
132 Parameters:
133 - files: list with file sizes
134 - dvdsz: dvd size (single side)
135 - slack: maximum acceptable error when calculating the selection (default=50)
136 - itermax: maximum number of iteration (default=2000)
137 - beta: sigmoid parameter (default 0.005, empirically determined)
138
139 Returns:
140 - indexes of selected files per dvds:
141 [ [idx1, idx2, ..., idxn], [idx1, idx2, ..., idxm], ..., [idx1, idx2, ..., idxk] ]
142 - compilation sizes: [ size1, size2, ..., sizen ]
143 """
144 # first create a dictionary with all file sizes indexed. Files choosen
145 # by dvdSplit will be removed from this dictionary until it becomes empty
146 filesdic = dict()
147 n = len(files)
148 for i in range(n):
149 filesdic[i] = files[i]
150 dvds = []
151 sz = []
152 running = True
153 r = 1
154 while(running):
155 v = filesdic.values()
156 k = filesdic.keys()
157 print "Starting DVD %02d [%3.2f%% completed]" % (r,100-100*len(v)/n)
158 maxtries = 50
159 while(True):
160 sel,ns,dsz,iter = dvdSplit(v,dvdsz,slack,itermax,beta)
161 if dsz < dvdsz:
162 break;
163 maxtries = maxtries - 1
164 if(maxtries == 0):
165 print "Could not find a good solution for DVD %d in 50 attempts. Try again, please." % (r)
166 sys.exit(1)
167 # getting indexes
168 r = r + 1
169 d = []
170 for s in sel:
171 d.append(k[s])
172 del filesdic[k[s]]
173 # saving dvd compilation
174 dvds.append(d)
175 sz.append(dsz)
176 if len(ns) == 0:
177 running = False
178
179 return dvds, sz
180
181 def dvdConsistency(filenames,filesizes,dvdsz):
182 """
183 Consistency checks.
184
185 Parameters:
186 - filenames: file names dict
187 - filesizes: file sizes dict
188 - dvds: dvd size
189 """
190 # look for any file bigger than dvd side
191 for i in filesizes:
192 if filesizes[i] >= dvdsz:
193 print "Size %d is bigger than dvd size %d" % (filesizes[i],dvdsz)
194 sys.exit(1)
195
196 def printDvdLayout(filesizes,filenames,dvdsz,dvds,s):
197 """
198 Print results
199 """
200 i = 0
201 n = len(dvds)
202 tts = []
203 print "\nSummary:"
204 for dvd in dvds:
205 tt = []
206 i = i + 1
207 for track in dvd:
208 tt.append(filesizes[track])
209 tts.append(sum(tt))
210 print "[DVD %d/%d, SIZE %dMBs, USAGE %2.2f%%]" % (i,n,tts[i-1],100.0*tts[i-1]/dvdsz)
211
212 i = 0
213 print "\nFile seletion per DVD:"
214 for dvd in dvds:
215 i = i + 1
216 print "[DVD %d/%d, SIZE %dMBs]" % (i,n,tts[i-1])
217 for track in dvd:
218 files = filenames[track].split(',')
219 files.sort()
220 for f in files:
221 print "\t%s" % (f)
222
223 def buildDvds(basedir,dvdsz,slack):
224 """
225 Get the files list and call optimization routines
226 """
227 filelist = os.listdir(basedir)
228 filenames = dict()
229 filesizes = dict()
230 i = 0
231 print "\nFiles at base directory %s (DVD size = %d, slack = %d)" % (basedir,dvdsz,slack)
232 for f in filelist:
233 filename = basedir+f
234 if os.path.isfile(filename):
235 filenames[i] = f
236 # in MB, rounded up
237 filesizes[i] = int(os.path.getsize(filename)/1024.0**2 + 1)
238 print '[%5.2fMB ] %s' % (filesizes[i],filenames[i])
239 i = i + 1
240
241 print "\nChecking consistency ..."
242 dvdConsistency(filenames,filesizes,dvdsz)
243 dvds, s = dvdsSplit(filesizes.values(),dvdsz,slack)
244 printDvdLayout(filesizes,filenames,dvdsz,dvds,s)
245
246 def main():
247
248 usage = "Usage: %prog [options]"
249 cmdline = OptionParser(usage)
250 cmdline.add_option("-d", "--dir", action="store", dest="basedir", type="string", default=".", metavar="DIR", help="Directory where files are stored")
251 cmdline.add_option("-s", "--size", action="store", dest="dvdsz", type="int", default="4500", metavar="SIZE", help="DVD size")
252 cmdline.add_option("-l", "--slack",action="store", dest="slack", type="int", default="50", metavar="SLACK", help="Maximum acceptable error")
253
254 (options, args) = cmdline.parse_args()
255
256 if(os.path.isdir(options.basedir) == False):
257 print "%s is not a valid directory" %(options.basedir)
258 sys.exit(1)
259
260 options.basedir.strip()
261 basedir = os.path.normpath(options.basedir)
262 if(os.path.isdir("c:\\")):
263 basedir = basedir + "\\"
264 else:
265 basedir = basedir + "//"
266
267 buildDvdsPrx = psyco.proxy(buildDvds)
268 buildDvdsPrx(basedir,int(options.dvdsz),int(options.slack))
269
270 if __name__ == "__main__":
271
272 main()
Volta para CookBook.
Marcelo Barros