66import commands
77import random
88
9- from pyres .exceptions import NoQueueError , TimeoutError
9+ from pyres .exceptions import NoQueueError , JobError , TimeoutError , CrashError
1010from pyres .job import Job
1111from pyres import ResQ , Stat , __version__
1212
@@ -128,8 +128,6 @@ def work(self, interval=5):
128128 that job to make sure another worker won't run it, then *forks* itself to
129129 work on that job.
130130
131- Finally, the ``process`` method actually processes the job by eventually calling the Job instance's ``perform`` method.
132-
133131 """
134132 self ._setproctitle ("Starting" )
135133 self .startup ()
@@ -142,63 +140,7 @@ def work(self, interval=5):
142140 job = self .reserve (interval )
143141
144142 if job :
145- logger .debug ('picked up job' )
146- logger .debug ('job details: %s' % job )
147- self .before_fork (job )
148- self .child = os .fork ()
149- if self .child :
150- self ._setproctitle ("Forked %s at %s" %
151- (self .child ,
152- datetime .datetime .now ()))
153- logger .info ('Forked %s at %s' % (self .child ,
154- datetime .datetime .now ()))
155-
156- try :
157- start = datetime .datetime .now ()
158-
159- # waits for the result or times out
160- while True :
161- result = os .waitpid (self .child , os .WNOHANG )
162- if result != (0 , 0 ):
163- break
164- time .sleep (0.5 )
165-
166- now = datetime .datetime .now ()
167- if self .timeout and ((now - start ).seconds > self .timeout ):
168- os .kill (self .child , signal .SIGKILL )
169- os .waitpid (- 1 , os .WNOHANG )
170- raise TimeoutError ("Timed out after %d seconds" % self .timeout )
171-
172- except OSError as ose :
173- import errno
174-
175- if ose .errno != errno .EINTR :
176- raise ose
177-
178- except TimeoutError as e :
179- exceptionType , exceptionValue , exceptionTraceback = sys .exc_info ()
180- logger .exception ("%s timed out: %s" % (job , e ))
181- job .fail (exceptionTraceback )
182- self .failed ()
183- self .done_working ()
184-
185- logger .debug ('done waiting' )
186- else :
187- self ._setproctitle ("Processing %s since %s" %
188- (job ._queue ,
189- datetime .datetime .now ()))
190- logger .info ('Processing %s since %s' %
191- (job ._queue , datetime .datetime .now ()))
192- self .after_fork (job )
193-
194- # re-seed the Python PRNG after forking, otherwise
195- # all job process will share the same sequence of
196- # random numbers
197- random .seed ()
198-
199- self .process (job )
200- os ._exit (0 )
201- self .child = None
143+ self .fork_worker (job )
202144 else :
203145 if interval == 0 :
204146 break
@@ -207,6 +149,81 @@ def work(self, interval=5):
207149 #time.sleep(interval)
208150 self .unregister_worker ()
209151
152+ def fork_worker (self , job ):
153+ """Invoked by ``work`` method. ``fork_worker`` does the actual forking to create the child
154+ process that will process the job. It's also responsible for monitoring the child process
155+ and handling hangs and crashes.
156+
157+ Finally, the ``process`` method actually processes the job by eventually calling the Job
158+ instance's ``perform`` method.
159+
160+ """
161+ logger .debug ('picked up job' )
162+ logger .debug ('job details: %s' % job )
163+ self .before_fork (job )
164+ self .child = os .fork ()
165+ if self .child :
166+ self ._setproctitle ("Forked %s at %s" %
167+ (self .child ,
168+ datetime .datetime .now ()))
169+ logger .info ('Forked %s at %s' % (self .child ,
170+ datetime .datetime .now ()))
171+
172+ try :
173+ start = datetime .datetime .now ()
174+
175+ # waits for the result or times out
176+ while True :
177+ pid , status = os .waitpid (self .child , os .WNOHANG )
178+ if pid != 0 :
179+ if os .WIFEXITED (status ) and os .WEXITSTATUS (status ) == 0 :
180+ break
181+ if os .WIFSTOPPED (status ):
182+ logger .warning ("Process stopped by signal %d" % os .WSTOPSIG (status ))
183+ else :
184+ if os .WIFSIGNALED (status ):
185+ raise CrashError ("Unexpected exit by signal %d" % os .WTERMSIG (status ))
186+ raise CrashError ("Unexpected exit status %d" % os .WEXITSTATUS (status ))
187+
188+ time .sleep (0.5 )
189+
190+ now = datetime .datetime .now ()
191+ if self .timeout and ((now - start ).seconds > self .timeout ):
192+ os .kill (self .child , signal .SIGKILL )
193+ os .waitpid (- 1 , os .WNOHANG )
194+ raise TimeoutError ("Timed out after %d seconds" % self .timeout )
195+
196+ except OSError as ose :
197+ import errno
198+
199+ if ose .errno != errno .EINTR :
200+ raise ose
201+ except JobError :
202+ self ._handle_job_exception (job )
203+ finally :
204+ # If the child process' job called os._exit manually we need to
205+ # finish the clean up here.
206+ if self .job ():
207+ self .done_working ()
208+
209+ logger .debug ('done waiting' )
210+ else :
211+ self ._setproctitle ("Processing %s since %s" %
212+ (job ._queue ,
213+ datetime .datetime .now ()))
214+ logger .info ('Processing %s since %s' %
215+ (job ._queue , datetime .datetime .now ()))
216+ self .after_fork (job )
217+
218+ # re-seed the Python PRNG after forking, otherwise
219+ # all job process will share the same sequence of
220+ # random numbers
221+ random .seed ()
222+
223+ self .process (job )
224+ os ._exit (0 )
225+ self .child = None
226+
210227 def before_fork (self , job ):
211228 """
212229 hook for making changes immediately before forking to process
@@ -227,21 +244,33 @@ def before_process(self, job):
227244 def process (self , job = None ):
228245 if not job :
229246 job = self .reserve ()
247+
248+ job_failed = False
230249 try :
231- self .working_on (job )
232- job = self .before_process (job )
233- return job .perform ()
234- except Exception , e :
235- exceptionType , exceptionValue , exceptionTraceback = sys .exc_info ()
236- logger .exception ("%s failed: %s" % (job , e ))
237- job .fail (exceptionTraceback )
238- self .failed ()
239- else :
240- logger .info ('completed job' )
241- logger .debug ('job details: %s' % job )
250+ try :
251+ self .working_on (job )
252+ job = self .before_process (job )
253+ return job .perform ()
254+ except Exception :
255+ job_failed = True
256+ self ._handle_job_exception (job )
257+ except SystemExit , e :
258+ if e .code != 0 :
259+ job_failed = True
260+ self ._handle_job_exception (job )
261+
262+ if not job_failed :
263+ logger .info ('completed job' )
264+ logger .debug ('job details: %s' % job )
242265 finally :
243266 self .done_working ()
244267
268+ def _handle_job_exception (self , job ):
269+ exceptionType , exceptionValue , exceptionTraceback = sys .exc_info ()
270+ logger .exception ("%s failed: %s" % (job , exceptionValue ))
271+ job .fail (exceptionTraceback )
272+ self .failed ()
273+
245274 def reserve (self , timeout = 10 ):
246275 logger .debug ('checking queues %s' % self .queues )
247276 job = self .job_class .reserve (self .queues , self .resq , self .__str__ (), timeout = timeout )
0 commit comments