1515# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
1616# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
1717#
18- #$Id: blobfiles.py,v 1.23 2007-10-26 06:52:26 richard Exp $
18+ #$Id: blobfiles.py,v 1.24 2008-02-07 00:57:59 richard Exp $
1919'''This module exports file storage for roundup backends.
2020Files are stored into a directory hierarchy.
2121'''
@@ -36,8 +36,179 @@ def files_in_dir(dir):
3636 return num_files
3737
3838class FileStorage :
39- """Store files in some directory structure"""
39+ """Store files in some directory structure
4040
41+ Some databases do not permit the storage of arbitrary data (i.e.,
42+ file content). And, some database schema explicitly store file
43+ content in the fielsystem. In particular, if a class defines a
44+ 'filename' property, it is assumed that the data is stored in the
45+ indicated file, outside of whatever database Roundup is otherwise
46+ using.
47+
48+ In these situations, it is difficult to maintain the transactional
49+ abstractions used elsewhere in Roundup. In particular, if a
50+ file's content is edited, but then the containing transaction is
51+ not committed, we do not want to commit the edit. Similarly, we
52+ would like to guarantee that if a transaction is committed to the
53+ database, then the edit has in fact taken place.
54+
55+ This class provides an approximation of these transactional
56+ requirements.
57+
58+ For classes that do not have a 'filename' property, the file name
59+ used to store the file's content is a deterministic function of
60+ the classname and nodeid for the file. The 'filename' function
61+ computes this name. The name will contain directories and
62+ subdirectories, but, suppose, for the purposes of what follows,
63+ that the filename is 'file'.
64+
65+ Edit Procotol
66+ -------------
67+
68+ When a file is created or edited, the following protocol is used:
69+
70+ 1. The new content of the file is placed in 'file.tmp'.
71+
72+ 2. A transaction is recored in 'self.transactions' referencing the
73+ 'doStoreFile' method of this class.
74+
75+ 3. At some subsequent point, the database 'commit' function is
76+ called. This function first performs a traditional database
77+ commit (for example, by issuing a SQL command to commit the
78+ current transaction), and, then, runs the transactions recored
79+ in 'self.transactions'.
80+
81+ 4. The 'doStoreFile' method renames the 'file.tmp' to 'file'.
82+
83+ If Step 3 never occurs, but, instead, the database 'rollback'
84+ method is called, then that method, after rolling back the
85+ database transaction, calls 'rollbackStoreFile', which removes
86+ 'file.tmp'.
87+
88+ Race Condition
89+ --------------
90+
91+ If two Roundup instances (say, the mail gateway and a web client,
92+ or two web clients running with a multi-process server) attempt
93+ edits at the same time, both will write to 'file.tmp', and the
94+ results will be indeterminate.
95+
96+ Crash Analysis
97+ --------------
98+
99+ There are several situations that may occur if a crash (whether
100+ because the machine crashes, because an unhandled Python exception
101+ is raised, or because the Python process is killed) occurs.
102+
103+ Complexity ensues because backuping up an RDBMS is generally more
104+ complex than simply copying a file. Instead, some command is run
105+ which stores a snapshot of the database in a file. So, if you
106+ back up the database to a file, and then back up the filesystem,
107+ it is likely that further database transactions have occurred
108+ between the point of database backup and the point of filesystem
109+ backup.
110+
111+ For the purposes, of this analysis, we assume that the filesystem
112+ backup occurred after the database backup. Furthermore, we assume
113+ that filesystem backups are atomic; i.e., the at the filesystem is
114+ not being modified during the backup.
115+
116+ 1. Neither the 'commit' nor 'rollback' methods on the database are
117+ ever called.
118+
119+ In this case, the '.tmp' file should be ignored as the
120+ transaction was not committed.
121+
122+ 2. The 'commit' method is called. Subsequently, the machine
123+ crashes, and is restored from backups.
124+
125+ The most recent filesystem backup and the most recent database
126+ backup are not in general from the same instant in time.
127+
128+ This problem means that we can never be sure after a crash if
129+ the contents of a file are what we intend. It is always
130+ possible that an edit was made to the file that is not
131+ reflected in the filesystem.
132+
133+ 3. A crash occurs between the point of the database commit and the
134+ call to 'doStoreFile'.
135+
136+ If only one of 'file' and 'file.tmp' exists, then that
137+ version should be used. However, if both 'file' and 'file.tmp'
138+ exist, there is no way to know which version to use.
139+
140+ Reading the File
141+ ----------------
142+
143+ When determining the content of the file, we use the following
144+ algorithm:
145+
146+ 1. If 'self.transactions' reflects an edit of the file, then use
147+ 'file.tmp'.
148+
149+ We know that an edit to the file is in process so 'file.tmp' is
150+ the right choice. If 'file.tmp' does not exist, raise an
151+ exception; something has removed the content of the file while
152+ we are in the process of editing it.
153+
154+ 2. Otherwise, if 'file.tmp' exists, and 'file' does not, use
155+ 'file.tmp'.
156+
157+ We know that the file is supposed to exist because there is a
158+ reference to it in the database. Since 'file' does not exist,
159+ we assume that Crash 3 occurred during the initial creation of
160+ the file.
161+
162+ 3. Otherwise, use 'file'.
163+
164+ If 'file.tmp' is not present, this is obviously the best we can
165+ do. This is always the right answer unless Crash 2 occurred,
166+ in which case the contents of 'file' may be newer than they
167+ were at the point of database backup.
168+
169+ If 'file.tmp' is present, we know that we are not actively
170+ editing the file. The possibilities are:
171+
172+ a. Crash 1 has occurred. In this case, using 'file' is the
173+ right answer, so we will have chosen correctly.
174+
175+ b. Crash 3 has occurred. In this case, 'file.tmp' is the right
176+ answer, so we will have chosen incorrectly. However, 'file'
177+ was at least a previously committed value.
178+
179+ Future Improvements
180+ -------------------
181+
182+ One approach would be to take advantage of databases which do
183+ allow the storage of arbitary date. For example, MySQL provides
184+ the HUGE BLOB datatype for storing up to 4GB of data.
185+
186+ Another approach would be to store a version ('v') in the actual
187+ database and name files 'file.v'. Then, the editing protocol
188+ would become:
189+
190+ 1. Generate a new version 'v', guaranteed to be different from all
191+ other versions ever used by the database. (The version need
192+ not be in any particular sequence; a UUID would be fine.)
193+
194+ 2. Store the content in 'file.v'.
195+
196+ 3. Update the database to indicate that the version of the node is
197+ 'v'.
198+
199+ Now, if the transaction is committed, the database will refer to
200+ 'file.v', where the content exists. If the transaction is rolled
201+ back, or not committed, 'file.v' will never be referenced. In the
202+ event of a crash, under the assumptions above, there may be
203+ 'file.v' files that are not referenced by the database, but the
204+ database will be consistent, so long as unreferenced 'file.v'
205+ files are never removed until after the database has been backed
206+ up.
207+ """
208+
209+ tempext = '.tmp'
210+ """The suffix added to files indicating that they are uncommitted."""
211+
41212 def __init__ (self , umask ):
42213 self .umask = umask
43214
@@ -54,6 +225,13 @@ def subdirFilename(self, classname, nodeid, property=None):
54225 subdir = str (int (nodeid ) / 1000 )
55226 return os .path .join (subdir , name )
56227
228+ def _tempfile (self , filename ):
229+ """Return a temporary filename.
230+
231+ 'filename' -- The name of the eventual destination file."""
232+
233+ return filename + self .tempext
234+
57235 def filename (self , classname , nodeid , property = None , create = 0 ):
58236 '''Determine what the filename for the given node and optionally
59237 property is.
@@ -64,14 +242,45 @@ def filename(self, classname, nodeid, property=None, create=0):
64242 '''
65243 filename = os .path .join (self .dir , 'files' , classname ,
66244 self .subdirFilename (classname , nodeid , property ))
67- if create or os .path .exists (filename ):
245+ # If the caller is going to create the file, return the
246+ # post-commit filename. It is the callers responsibility to
247+ # add self.tempext when actually creating the file.
248+ if create :
68249 return filename
69250
70- # try .tmp
71- filename = filename + '.tmp'
251+ tempfile = self ._tempfile (filename )
252+
253+ # If an edit to this file is in progress, then return the name
254+ # of the temporary file containing the edited content.
255+ for method , args in self .transactions :
256+ if (method == self .doStoreFile and
257+ args == (classname , nodeid , property )):
258+ # There is an edit in progress for this file.
259+ if not os .path .exists (tempfile ):
260+ raise IOError ('content file for %s not found' % tempfile )
261+ return tempfile
262+
72263 if os .path .exists (filename ):
73264 return filename
74265
266+ # Otherwise, if the temporary file exists, then the probable
267+ # explanation is that a crash occurred between the point that
268+ # the database entry recording the creation of the file
269+ # occured and the point at which the file was renamed from the
270+ # temporary name to the final name.
271+ if os .path .exists (tempfile ):
272+ try :
273+ # Clean up, by performing the commit now.
274+ os .rename (tempfile , filename )
275+ except :
276+ pass
277+ # If two Roundup clients both try to rename the file
278+ # at the same time, only one of them will succeed.
279+ # So, tolerate such an error -- but no other.
280+ if not os .path .exists (filename ):
281+ raise IOError ('content file for %s not found' % filename )
282+ return filename
283+
75284 # ok, try flat (very old-style)
76285 if property :
77286 filename = os .path .join (self .dir , 'files' , '%s%s.%s' % (classname ,
@@ -98,7 +307,7 @@ def storefile(self, classname, nodeid, property, content):
98307 os .makedirs (os .path .dirname (name ))
99308
100309 # save to a temp file
101- name = name + '.tmp'
310+ name = self . _tempfile ( name )
102311
103312 # make sure we don't register the rename action more than once
104313 if not os .path .exists (name ):
@@ -136,13 +345,13 @@ def doStoreFile(self, classname, nodeid, property, **databases):
136345 name = self .filename (classname , nodeid , property , 1 )
137346
138347 # the file is currently ".tmp" - move it to its real name to commit
139- if name .endswith ('.tmp' ):
348+ if name .endswith (self . tempext ):
140349 # creation
141350 dstname = os .path .splitext (name )[0 ]
142351 else :
143352 # edit operation
144353 dstname = name
145- name = name + '.tmp'
354+ name = self . _tempfile ( name )
146355
147356 # content is being updated (and some platforms, eg. win32, won't
148357 # let us rename over the top of the old file)
@@ -159,8 +368,8 @@ def rollbackStoreFile(self, classname, nodeid, property, **databases):
159368 '''
160369 # determine the name of the file to delete
161370 name = self .filename (classname , nodeid , property )
162- if not name .endswith ('.tmp' ):
163- name += '.tmp'
371+ if not name .endswith (self . tempext ):
372+ name += self . tempext
164373 os .remove (name )
165374
166375 def isStoreFile (self , classname , nodeid ):
0 commit comments