associação pythonbrasil[11] django zope/plone planet Início Logado como (Entrar)

DividirArquivosEmVariosCdOuDvd

MarceloBarros

Este script permite que uma série de arquivos sejam organizados em vários DVDs/CDs de forma otimizada, isto é, tentando sempre utilizar ao máximo o espaço disponível na mídia. É útil quando você não quer pensar muito na melhor organização e eu venho usado para organizar os meus arquivos de backup. Ao final, ele lista uma opção de organização. Como o algoritmo é estatístico, você pode rodar novamente para ter um resultado diferente, apesar de construído com os mesmos critérios.

Este programa usa uma rede neural do tipo Hopfield para a solução do problema. Basicamente é necessário especificar o diretório onde os arquivos estão, o tamanho da mídia e a folga máxima. Se nenhum parâmetro é especificado, é usado o diretório corrente, uma mídia de 4500MB (DVD) e uma folga de 50MB. A folga diz o quanto o algoritmo pode encarar margem de erro, já que em geral não é possível ter uma divisão sempre perfeita.

Rode com dvdsplit.py -h para informações da linha de comando. É meu primeiro programa em Python, comentário são bem vindos.

Código

   1 """
   2 DVD Split, by Marcelo Barros (marcelobarrosalmeida(at)gmail.com)
   3 
   4 March/17/2016 - modifications for python 3, removing psyco dependence (obsolete)
   5 Sept/13/2005 - original version
   6 $Id: dvdsplit.py,v 1.1 2005/09/13 14:54:36 barros Exp $
   7 """
   8 import random
   9 import math
  10 import os
  11 import sys
  12 from optparse import OptionParser
  13 #import psyco
  14 
  15 def currentSize(files, selected):
  16     """
  17     Calculate the size of the current selection.
  18     
  19     Parameters:
  20     - files: list with file sizes
  21     - selected: list with 1 or -1, indicating if the file was selected or not
  22 
  23     Returns:
  24     - the current selection size
  25     """
  26     sz = 0
  27     for i in range(len(files)):
  28         if(selected[i] > 0):
  29             sz = sz + files[i]
  30     return sz
  31 
  32 def energyFun(files,selected,dvdsz):
  33     """
  34     Energy function used. In this case is:
  35     f(x) = |current_size - dvdsize |
  36 
  37     Parameters:
  38     - files: list with file sizes
  39     - selected: list with 1 or -1, indicating if the file was selected or not
  40     - dvdsz: dvd size
  41 
  42     Returns:
  43     - energy function value for the current selection
  44     """
  45     energ = 0
  46     for i in range(len(files)):
  47         if selected[i] > 0:
  48             energ = energ + files[i]
  49     energ = abs(energ - dvdsz)
  50     return energ
  51 
  52 def dvdInitialGuess(files,selected,dvdsz):
  53     """
  54     Just provide an initial random guess to the algorithm
  55 
  56     Parameters:
  57     - files: list with file sizes
  58     - selected: list with 1 or -1, indicating if the file was selected or not
  59     - dvdsz: dvd size    
  60     """
  61     l = len(files)
  62     idx = range(l)
  63     idx = random.sample(idx,l)
  64 
  65     t = 0
  66     for i in range(l):
  67         j = idx[i]
  68         t = t + files[j]
  69         selected[j] = 1
  70         if t >= dvdsz:
  71             break
  72     
  73 def dvdSplit(files,dvdsz,slack=50,itermax=3000,beta=0.005):
  74     """
  75     Given a list with file sizes, the dvd size and the acceptable slack
  76     this routine gives a subset that fits in one dvd.  
  77 
  78     Parameters:
  79     - files: list with file sizes
  80     - dvdsz: dvd size (single side)
  81     - slack: maximum acceptable error when calculating the selection
  82     - itermax: maximum number of iteration (default=2000)
  83     - beta: sigmoid parameter (default 0.005, empirically determined)
  84 
  85     Returns:
  86     - indexes of selected files: [ idx1, idx2, ..., idxn ]
  87     - indexes of not selected files: [ idx1, idx2, ..., idxn ]
  88     - current selection size
  89     - iteration used to calculate
  90     """
  91     running = True
  92     iter = 0
  93     numfiles = len(files)
  94     selected = [-1 for x in range(numfiles)]
  95     dvdInitialGuess(files,selected,dvdsz)
  96     
  97     sz = currentSize(files,selected) # maximum size must be greather than dvdsize
  98 
  99     if(sz > dvdsz):
 100         while(running and iter < itermax):
 101             # current energy 
 102             e1 = energyFun(files,selected,dvdsz)
 103             # change state for random node
 104             n = random.randint(0,numfiles-1)
 105             selected[n] = -selected[n]
 106             # new energy
 107             e2 = energyFun(files,selected,dvdsz)
 108             # evaluate the change against sigmoid and accept it or not
 109             # according to a uniform probability
 110             sig = (1/( 1 + math.exp(beta*(e2-e1))))
 111             if sig < random.uniform(0,1):
 112                 selected[n] = -selected[n] # state not accepted
 113             sz = currentSize(files,selected)
 114             # testing exit condition: (dvdsz - slack) < currentSize < dvdsz
 115             if ( (sz >= (dvdsz - slack)) and (sz < dvdsz) ):
 116                 running = False
 117             iter = iter + 1
 118         
 119     sel = []
 120     notsel = []
 121     for i in range(numfiles):
 122         if selected[i] > 0:
 123             sel.append(i)
 124         else:
 125             notsel.append(i)
 126     #print "Solution found in %d iterations. Compilation size is %d (%d files)" % (iter,sz,len(sel))
 127     return sel,notsel,sz,iter
 128 
 129 def dvdsSplit(files,dvdsz,slack=50,itermax=3000,beta=0.005):
 130     """
 131     Given a list with file sizes, the dvd size and the acceptable slack
 132     this routine distributes the selection over several dvs.
 133 
 134     Parameters:
 135     - files: list with file sizes
 136     - dvdsz: dvd size (single side)
 137     - slack: maximum acceptable error when calculating the selection (default=50)
 138     - itermax: maximum number of iteration (default=2000)
 139     - beta: sigmoid parameter (default 0.005, empirically determined)
 140 
 141     Returns:
 142     - indexes of selected files per dvds:
 143       [ [idx1, idx2, ..., idxn], [idx1, idx2, ..., idxm], ..., [idx1, idx2, ..., idxk] ]
 144     - compilation sizes: [ size1, size2, ..., sizen ]      
 145     """
 146     # first create a dictionary with all file sizes indexed. Files choosen
 147     # by dvdSplit will be removed from this dictionary until it becomes empty
 148     filesdic = dict()
 149     n = len(files)
 150     for i in range(n):
 151         filesdic[i] = files[i]
 152     dvds = []
 153     sz = []
 154     running = True
 155     r = 1
 156     while(running):    
 157         v = list(filesdic.values())
 158         k = list(filesdic.keys())
 159         print("Starting DVD %02d [%3.2f%% completed]" % (r,100-100*len(v)/n))
 160         maxtries = 50
 161         while(True):
 162             sel,ns,dsz,iter = dvdSplit(v,dvdsz,slack,itermax,beta)
 163             if dsz < dvdsz:
 164                 break;
 165             maxtries = maxtries - 1
 166             if(maxtries == 0):
 167                 print("Could not find a good solution for DVD %d in 50 attempts. Try again, please." % (r))
 168                 sys.exit(1)
 169         # getting indexes
 170         r = r + 1
 171         d = []
 172         for s in sel: 
 173             d.append(k[s])
 174             del filesdic[k[s]]
 175         # saving dvd compilation
 176         dvds.append(d)
 177         sz.append(dsz)
 178         if len(ns) == 0:
 179             running = False
 180 
 181     return dvds, sz
 182 
 183 def dvdConsistency(filenames,filesizes,dvdsz):
 184     """
 185     Consistency checks.
 186 
 187     Parameters:
 188     - filenames: file names dict
 189     - filesizes: file sizes dict
 190     - dvds: dvd size
 191     """
 192     # look for any file bigger than dvd side
 193     for i in filesizes:
 194         if filesizes[i] >= dvdsz:
 195             print("Size %d is bigger than dvd size %d" % (filesizes[i],dvdsz))
 196             sys.exit(1)
 197 
 198 def printDvdLayout(filesizes,filenames,dvdsz,dvds,s):
 199     """
 200     Print results
 201     """
 202     i = 0
 203     n = len(dvds)    
 204     tts = []
 205     print("\nSummary:")
 206     for dvd in dvds:
 207         tt = []
 208         i = i + 1
 209         for track in dvd:
 210             tt.append(filesizes[track])
 211         tts.append(sum(tt))
 212         print("[DVD %d/%d, SIZE %dMBs, USAGE %2.2f%%]" % (i,n,tts[i-1],100.0*tts[i-1]/dvdsz))
 213             
 214     i = 0
 215     print("\nFile seletion per DVD:")
 216     for dvd in dvds:
 217         i = i + 1
 218         print("[DVD %d/%d, SIZE %dMBs]" % (i,n,tts[i-1]))
 219         for track in dvd:
 220             files = filenames[track].split(',')
 221             files.sort()
 222             for f in files:
 223                 print("\t%s" % (f))
 224         
 225 def buildDvds(basedir,dvdsz,slack):
 226     """
 227     Get the files list and call optimization routines
 228     """
 229     filelist = os.listdir(basedir)
 230     filenames = dict()
 231     filesizes = dict()
 232     i = 0
 233     print("\nFiles at base directory %s (DVD size = %d, slack = %d)" % (basedir,dvdsz,slack))
 234     for f in filelist:
 235         filename = basedir+f
 236         if os.path.isfile(filename):
 237             filenames[i] = f
 238             # in MB, rounded up
 239             filesizes[i] = int(os.path.getsize(filename)/1024.0**2 + 1)
 240             print("[%5.2fMB ] %s" % (filesizes[i],filenames[i]))
 241             i = i + 1
 242 
 243     print("\nChecking consistency ...")
 244     dvdConsistency(filenames,filesizes,dvdsz)
 245     dvds, s = dvdsSplit(list(filesizes.values()),dvdsz,slack)
 246     printDvdLayout(filesizes,filenames,dvdsz,dvds,s)
 247 
 248 def main():
 249 
 250     usage = "Usage: %prog [options]"
 251     cmdline = OptionParser(usage)
 252     cmdline.add_option("-d", "--dir",  action="store", dest="basedir", type="string", default=".",    metavar="DIR",      help="Directory where files are stored")
 253     cmdline.add_option("-s", "--size", action="store", dest="dvdsz",   type="int",    default="4500", metavar="SIZE",     help="DVD size (MB)")
 254     cmdline.add_option("-l", "--slack",action="store", dest="slack",   type="int",    default="50",   metavar="SLACK",    help="Maximum acceptable error (MB)")
 255     
 256     (options, args) = cmdline.parse_args()
 257 
 258     if(os.path.isdir(options.basedir) == False):
 259         print("%s is not a valid directory" %(options.basedir))
 260         sys.exit(1)
 261 
 262     options.basedir.strip()
 263     basedir = os.path.normpath(options.basedir)
 264     if(os.path.isdir("c:\\")):
 265         basedir = basedir + "\\"
 266     else:
 267         basedir = basedir + "//"
 268 
 269     #buildDvdsPrx = psyco.proxy(buildDvds)
 270     #buildDvdsPrx(basedir,int(options.dvdsz),int(options.slack))
 271     buildDvds(basedir,int(options.dvdsz),int(options.slack))
 272     
 273 if __name__ == "__main__":
 274 
 275     main()

Volta para CookBook.


Marcelo Barros