associação pythonbrasil[11] django zope/plone planet Início Logado como (Entrar)

Revisão 10e 2008-09-26 14:06:34

Excluir mensagem

DividirArquivosEmVariosCdOuDvd

MarceloBarros

Este script permite que uma série de arquivos sejam organizados em vários DVDs/CDs de forma otimizada, isto é, tentando sempre utilizar ao máximo o espaço disponível na mídia. É útil quando você não quer pensar muito na melhor organização e eu venho usado para organizar os meus arquivos de backup. Ao final, ele lista uma opção de organização. Como o algoritmo é estatístico, você pode rodar novamente para ter um resultado diferente, apesar de construído com os mesmos critérios.

Este programa usa uma rede neural do tipo Hopfield para a solução do problema. Basicamente é necessário especificar o diretório onde os arquivos estão, o tamanho da mídia e a folga máxima. Se nenhum parâmetro é especificado, é usado o diretório corrente, uma mídia de 4500MB (DVD) e uma folga de 50MB. A folga diz o quanto o algoritmo pode encarar margem de erro, já que em geral não é possível ter uma divisão sempre perfeita.

Rode com dvdsplit.py -h para informações da linha de comando. É meu primeiro programa em Python, comentário são bem vindos.

Código

   1 """
   2 DVD Split, by Marcelo Barros (marcelobarrosalmeida(at)yahoo.com.br)
   3 Sept/13/2005
   4 $Id: dvdsplit.py,v 1.1 2005/09/13 14:54:36 barros Exp $
   5 """
   6 import random
   7 import math
   8 import os
   9 import sys
  10 from optparse import OptionParser
  11 import psyco
  12 
  13 def currentSize(files, selected):
  14     """
  15     Calculate the size of the current selection.
  16     
  17     Parameters:
  18     - files: list with file sizes
  19     - selected: list with 1 or -1, indicating if the file was selected or not
  20 
  21     Returns:
  22     - the current selection size
  23     """
  24     sz = 0
  25     for i in range(len(files)):
  26         if(selected[i] > 0):
  27             sz = sz + files[i]
  28     return sz
  29 
  30 def energyFun(files,selected,dvdsz):
  31     """
  32     Energy function used. In this case is:
  33     f(x) = |current_size - dvdsize |
  34 
  35     Parameters:
  36     - files: list with file sizes
  37     - selected: list with 1 or -1, indicating if the file was selected or not
  38     - dvdsz: dvd size
  39 
  40     Returns:
  41     - energy function value for the current selection
  42     """
  43     energ = 0
  44     for i in range(len(files)):
  45         if selected[i] > 0:
  46             energ = energ + files[i]
  47     energ = abs(energ - dvdsz)
  48     return energ
  49 
  50 def dvdInitialGuess(files,selected,dvdsz):
  51     """
  52     Just provide an initial random guess to the algorithm
  53 
  54     Parameters:
  55     - files: list with file sizes
  56     - selected: list with 1 or -1, indicating if the file was selected or not
  57     - dvdsz: dvd size    
  58     """
  59     l = len(files)
  60     idx = range(l)
  61     idx = random.sample(idx,l)
  62 
  63     t = 0
  64     for i in range(l):
  65         j = idx[i]
  66         t = t + files[j]
  67         selected[j] = 1
  68         if t >= dvdsz:
  69             break
  70     
  71 def dvdSplit(files,dvdsz,slack=50,itermax=3000,beta=0.005):
  72     """
  73     Given a list with file sizes, the dvd size and the acceptable slack
  74     this routine gives a subset that fits in one dvd.  
  75 
  76     Parameters:
  77     - files: list with file sizes
  78     - dvdsz: dvd size (single side)
  79     - slack: maximum acceptable error when calculating the selection
  80     - itermax: maximum number of iteration (default=2000)
  81     - beta: sigmoid parameter (default 0.005, empirically determined)
  82 
  83     Returns:
  84     - indexes of selected files: [ idx1, idx2, ..., idxn ]
  85     - indexes of not selected files: [ idx1, idx2, ..., idxn ]
  86     - current selection size
  87     - iteration used to calculate
  88     """
  89     running = True
  90     iter = 0
  91     numfiles = len(files)
  92     selected = [-1 for x in range(numfiles)]
  93     dvdInitialGuess(files,selected,dvdsz)
  94     
  95     sz = currentSize(files,selected) # maximum size must be greather than dvdsize
  96 
  97     if(sz > dvdsz):
  98         while(running and iter < itermax):
  99             # current energy 
 100             e1 = energyFun(files,selected,dvdsz)
 101             # change state for random node
 102             n = random.randint(0,numfiles-1)
 103             selected[n] = -selected[n]
 104             # new energy
 105             e2 = energyFun(files,selected,dvdsz)
 106             # evaluate the change against sigmoid and accept it or not
 107             # according to a uniform probability
 108             sig = (1/( 1 + math.exp(beta*(e2-e1))))
 109             if sig < random.uniform(0,1):
 110                 selected[n] = -selected[n] # state not accepted
 111             sz = currentSize(files,selected)
 112             # testing exit condition: (dvdsz - slack) < currentSize < dvdsz
 113             if ( (sz >= (dvdsz - slack)) and (sz < dvdsz) ):
 114                 running = False
 115             iter = iter + 1
 116         
 117     sel = []
 118     notsel = []
 119     for i in range(numfiles):
 120         if selected[i] > 0:
 121             sel.append(i)
 122         else:
 123             notsel.append(i)
 124     #print "Solution found in %d iterations. Compilation size is %d (%d files)" % (iter,sz,len(sel))
 125     return sel,notsel,sz,iter
 126 
 127 def dvdsSplit(files,dvdsz,slack=50,itermax=3000,beta=0.005):
 128     """
 129     Given a list with file sizes, the dvd size and the acceptable slack
 130     this routine distributes the selection over several dvs.
 131 
 132     Parameters:
 133     - files: list with file sizes
 134     - dvdsz: dvd size (single side)
 135     - slack: maximum acceptable error when calculating the selection (default=50)
 136     - itermax: maximum number of iteration (default=2000)
 137     - beta: sigmoid parameter (default 0.005, empirically determined)
 138 
 139     Returns:
 140     - indexes of selected files per dvds:
 141       [ [idx1, idx2, ..., idxn], [idx1, idx2, ..., idxm], ..., [idx1, idx2, ..., idxk] ]
 142     - compilation sizes: [ size1, size2, ..., sizen ]      
 143     """
 144     # first create a dictionary with all file sizes indexed. Files choosen
 145     # by dvdSplit will be removed from this dictionary until it becomes empty
 146     filesdic = dict()
 147     n = len(files)
 148     for i in range(n):
 149         filesdic[i] = files[i]
 150     dvds = []
 151     sz = []
 152     running = True
 153     r = 1
 154     while(running):    
 155         v = filesdic.values()
 156         k = filesdic.keys()
 157         print "Starting DVD %02d [%3.2f%% completed]" % (r,100-100*len(v)/n)
 158         maxtries = 50
 159         while(True):
 160             sel,ns,dsz,iter = dvdSplit(v,dvdsz,slack,itermax,beta)
 161             if dsz < dvdsz:
 162                 break;
 163             maxtries = maxtries - 1
 164             if(maxtries == 0):
 165                 print "Could not find a good solution for DVD %d in 50 attempts. Try again, please." % (r)
 166                 sys.exit(1)
 167         # getting indexes
 168         r = r + 1
 169         d = []
 170         for s in sel: 
 171             d.append(k[s])
 172             del filesdic[k[s]]
 173         # saving dvd compilation
 174         dvds.append(d)
 175         sz.append(dsz)
 176         if len(ns) == 0:
 177             running = False
 178 
 179     return dvds, sz
 180 
 181 def dvdConsistency(filenames,filesizes,dvdsz):
 182     """
 183     Consistency checks.
 184 
 185     Parameters:
 186     - filenames: file names dict
 187     - filesizes: file sizes dict
 188     - dvds: dvd size
 189     """
 190     # look for any file bigger than dvd side
 191     for i in filesizes:
 192         if filesizes[i] >= dvdsz:
 193             print "Size %d is bigger than dvd size %d" % (filesizes[i],dvdsz)
 194             sys.exit(1)
 195 
 196 def printDvdLayout(filesizes,filenames,dvdsz,dvds,s):
 197     """
 198     Print results
 199     """
 200     i = 0
 201     n = len(dvds)    
 202     tts = []
 203     print "\nSummary:" 
 204     for dvd in dvds:
 205         tt = []
 206         i = i + 1
 207         for track in dvd:
 208             tt.append(filesizes[track])
 209         tts.append(sum(tt))
 210         print "[DVD %d/%d, SIZE %dMBs, USAGE %2.2f%%]" % (i,n,tts[i-1],100.0*tts[i-1]/dvdsz)
 211             
 212     i = 0
 213     print "\nFile seletion per DVD:" 
 214     for dvd in dvds:
 215         i = i + 1
 216         print "[DVD %d/%d, SIZE %dMBs]" % (i,n,tts[i-1])
 217         for track in dvd:
 218             files = filenames[track].split(',')
 219             files.sort()
 220             for f in files:
 221                 print "\t%s" % (f)
 222         
 223 def buildDvds(basedir,dvdsz,slack):
 224     """
 225     Get the files list and call optimization routines
 226     """
 227     filelist = os.listdir(basedir)
 228     filenames = dict()
 229     filesizes = dict()
 230     i = 0
 231     print "\nFiles at base directory %s (DVD size = %d, slack = %d)" % (basedir,dvdsz,slack)
 232     for f in filelist:
 233         filename = basedir+f
 234         if os.path.isfile(filename):
 235             filenames[i] = f
 236             # in MB, rounded up
 237             filesizes[i] = int(os.path.getsize(filename)/1024.0**2 + 1)
 238             print '[%5.2fMB ] %s' % (filesizes[i],filenames[i])
 239             i = i + 1
 240 
 241     print "\nChecking consistency ..."
 242     dvdConsistency(filenames,filesizes,dvdsz)
 243     dvds, s = dvdsSplit(filesizes.values(),dvdsz,slack)
 244     printDvdLayout(filesizes,filenames,dvdsz,dvds,s)
 245 
 246 def main():
 247 
 248     usage = "Usage: %prog [options]"
 249     cmdline = OptionParser(usage)
 250     cmdline.add_option("-d", "--dir",  action="store", dest="basedir", type="string", default=".",    metavar="DIR",      help="Directory where files are stored")
 251     cmdline.add_option("-s", "--size", action="store", dest="dvdsz",   type="int",    default="4500", metavar="SIZE",     help="DVD size")
 252     cmdline.add_option("-l", "--slack",action="store", dest="slack",   type="int",    default="50",   metavar="SLACK",    help="Maximum acceptable error")
 253     
 254     (options, args) = cmdline.parse_args()
 255 
 256     if(os.path.isdir(options.basedir) == False):
 257         print "%s is not a valid directory" %(options.basedir)
 258         sys.exit(1)
 259 
 260     options.basedir.strip()
 261     basedir = os.path.normpath(options.basedir)
 262     if(os.path.isdir("c:\\")):
 263         basedir = basedir + "\\"
 264     else:
 265         basedir = basedir + "//"
 266 
 267     buildDvdsPrx = psyco.proxy(buildDvds)
 268     buildDvdsPrx(basedir,int(options.dvdsz),int(options.slack))
 269     
 270 if __name__ == "__main__":
 271 
 272     main()

Volta para CookBook.


Marcelo Barros