From 0b9c78592263746d93ecba023282da3ba3c1eb2a Mon Sep 17 00:00:00 2001 From: hammonds <JPHammonds@anl.gov> Date: Thu, 5 Dec 2019 12:38:30 -0600 Subject: [PATCH] More corrections from use and also add a section demonstrating workflows and processing jobs. --- Installation/APSDeveloperInstallation.md | 182 +++++++++++++++++- .../DataManagementSplitSystemSetup.md | 181 ++++++++++++++++- 2 files changed, 360 insertions(+), 3 deletions(-) diff --git a/Installation/APSDeveloperInstallation.md b/Installation/APSDeveloperInstallation.md index 537c83b2..2f1bc793 100644 --- a/Installation/APSDeveloperInstallation.md +++ b/Installation/APSDeveloperInstallation.md @@ -169,7 +169,7 @@ This command will * Create an experiment named `e1`with - The three experimenters `jprofessor`, `gpostdoc` & `jgradstudent` - The data that is being collected will be found at `/home/dmadmin/testData` - - Any data/files found in `/home/dmadmin/testData` will be found in a directory `TEST/e1/MyFirstExperiment` of the storage location defined for the Data Storage service. + - Any data/files found in `/home/dmadmin/testData` will be found in a directory `TEST/e1/MyFirstExperiment` of the storage location defined for the Data Storage service. NOTE: if the directory `/home/dmadmin/testData` does not exist, then the upload process will fail. Output like the following @@ -210,4 +210,182 @@ and u'storageDirectory': u'/home/dmadmin/storage/TEST/e1', u'storageHost': u'localhost', u'storageUrl': u'extrepid://localhost/home/dmadmin/storage/TEST/e1'} -``` \ No newline at end of file +``` + +Next step will add a workflow and then execute this workflow. This workflow is an example pulled from the comments in the file workflowProcApi.py (owner name has been changed to match user dmtest). It creates a minimal version of a workflow that grabs the md5sum of a given file. 
The workflow is defined by the following + +``` + { + 'name' : 'example-02', + 'owner' : 'dmtest', + 'stages' : { + '01-START' : { + 'command' : '/bin/date +%Y%m%d%H%M%S', + 'outputVariableRegexList' : ['(?P<timeStamp>.*)'] + }, + '02-MKDIR' : { + 'command' : '/bin/mkdir -p /tmp/workflow.$timeStamp' + }, + '03-ECHO' : { + 'command' : '/bin/echo "START JOB ID: $id" > /tmp/workflow.$timeStamp/$id.out' + }, + '04-MD5SUM' : { + 'command' : '/bin/md5sum $filePath | cut -f1 -d" "', + 'outputVariableRegexList' : ['(?P<md5Sum>.*)'] + }, + '05-ECHO' : { + 'command' : 'echo "FILE $filePath MD5 SUM: $md5Sum" >> /tmp/workflow.$timeStamp/$id.out' + }, + '06-DONE' : { + 'command' : '/bin/echo "STOP JOB ID: $id" >> /tmp/workflow.$timeStamp/$id.out' + }, + }, + 'description' : 'Workflow Example 01' + } +``` + +This workflow can be added to the system with the command: + + > dm-upsert-workflow --py-spec=sampleWorkflow + +and will yield a result like: + +``` +id=5de938931d9a2030403a7dd0 name=example-02 owner=dmtest +``` + +This workflow can be executed by the command: + +> dm-start-processing-job --workflow-name=example-02 --workflow-owner=dmtest filePath:/home/dmadmin/testData/myData + +This will have a result like: + +``` +id=2f004219-0694-4955-af05-b29b48ce4c0a owner=dmtest status=pending startTime=1575566109.86 startTimestamp=2019/12/05 12:15:09 EST +``` + +More information can be found with `dm-get-processing-job` like: + + > dm-get-processing-job --id=2f004219-0694-4955-af05-b29b48ce4c0a --display-keys=ALL --display-format=pprint + +which returns + +```json +{ u'endTime': 1575566111.014859, + u'endTimestamp': u'2019/12/05 12:15:11 EST', + u'filePath': u'/home/dmadmin/testData/myData', + u'id': u'2f004219-0694-4955-af05-b29b48ce4c0a', + u'md5Sum': u'bac0be486ddc69992ab4e01eeade0b92', + u'nFiles': 1, + u'owner': u'dmtest', + u'runTime': 1.1574599742889404, + u'stage': u'06-DONE', + u'startTime': 1575566109.857399, + u'startTimestamp': u'2019/12/05 12:15:09 EST', + u'status': 
u'done', + u'timeStamp': u'20191205121510', + u'workflow': { u'description': u'Workflow Example 01', + u'id': u'5de938931d9a2030403a7dd0', + u'name': u'example-02', + u'owner': u'dmtest', + u'stages': { u'01-START': { u'childProcesses': { u'0': { u'childProcessNumber': 0, + u'command': u'/bin/date +%Y%m%d%H%M%S', + u'endTime': 1575566110.898553, + u'exitStatus': 0, + u'runTime': 0.007671833038330078, + u'stageId': u'01-START', + u'startTime': 1575566110.890881, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'20191205121510\n', + u'submitTime': 1575566110.859169, + u'workingDir': None}}, + u'command': u'/bin/date +%Y%m%d%H%M%S', + u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + u'nRunningChildProcesses': 0, + u'outputVariableRegexList': [ u'(?P<timeStamp>.*)']}, + u'02-MKDIR': { u'childProcesses': { u'1': { u'childProcessNumber': 1, + u'command': u'/bin/mkdir -p /tmp/workflow.20191205121510', + u'endTime': 1575566110.942735, + u'exitStatus': 0, + u'runTime': 0.0035638809204101562, + u'stageId': u'02-MKDIR', + u'startTime': 1575566110.939171, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'', + u'submitTime': 1575566110.925104, + u'workingDir': None}}, + u'command': u'/bin/mkdir -p /tmp/workflow.$timeStamp', + u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + u'nRunningChildProcesses': 0}, + u'03-ECHO': { u'childProcesses': { u'2': { u'childProcessNumber': 2, + u'command': u'/bin/echo "START JOB ID: 2f004219-0694-4955-af05-b29b48ce4c0a" > /tmp/workflow.20191205121510/2f004219-0694-4955-af05-b29b48ce4c0a.out', + u'endTime': 1575566110.972364, + u'exitStatus': 0, + u'runTime': 0.003882884979248047, + u'stageId': u'03-ECHO', + u'startTime': 1575566110.968481, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'', + u'submitTime': 1575566110.960305, + u'workingDir': None}}, + u'command': u'/bin/echo "START JOB ID: $id" > /tmp/workflow.$timeStamp/$id.out', + u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + 
u'nRunningChildProcesses': 0}, + u'04-MD5SUM': { u'childProcesses': { u'3': { u'childProcessNumber': 3, + u'command': u'/bin/md5sum /home/dmadmin/testData/myData | cut -f1 -d" "', + u'endTime': 1575566110.985139, + u'exitStatus': 0, + u'runTime': 0.0030689239501953125, + u'stageId': u'04-MD5SUM', + u'startTime': 1575566110.98207, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'bac0be486ddc69992ab4e01eeade0b92\n', + u'submitTime': 1575566110.973093, + u'workingDir': None}}, + u'command': u'/bin/md5sum $filePath | cut -f1 -d" "', + u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + u'nRunningChildProcesses': 0, + u'outputVariableRegexList': [ u'(?P<md5Sum>.*)']}, + u'05-ECHO': { u'childProcesses': { u'4': { u'childProcessNumber': 4, + u'command': u'echo "FILE /home/dmadmin/testData/myData MD5 SUM: bac0be486ddc69992ab4e01eeade0b92" >> /tmp/workflow.20191205121510/2f004219-0694-4955-af05-b29b48ce4c0a.out', + u'endTime': 1575566110.997652, + u'exitStatus': 0, + u'runTime': 0.0005791187286376953, + u'stageId': u'05-ECHO', + u'startTime': 1575566110.997073, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'', + u'submitTime': 1575566110.987421, + u'workingDir': None}}, + u'command': u'echo "FILE $filePath MD5 SUM: $md5Sum" >> /tmp/workflow.$timeStamp/$id.out', + u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + u'nRunningChildProcesses': 0}, + u'06-DONE': { u'childProcesses': { u'5': { u'childProcessNumber': 5, + u'command': u'/bin/echo "STOP JOB ID: 2f004219-0694-4955-af05-b29b48ce4c0a" >> /tmp/workflow.20191205121510/2f004219-0694-4955-af05-b29b48ce4c0a.out', + u'endTime': 1575566111.011913, + u'exitStatus': 0, + u'runTime': 0.001583099365234375, + u'stageId': u'06-DONE', + u'startTime': 1575566111.01033, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'', + u'submitTime': 1575566111.002148, + u'workingDir': None}}, + u'command': u'/bin/echo "STOP JOB ID: $id" >> /tmp/workflow.$timeStamp/$id.out', + 
u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + u'nRunningChildProcesses': 0}}}} +``` + +Note that the md5 sum of the file `/home/dmadmin/testData/myData` is listed in the `stdOut` of stage `04-MD5SUM` and is used in the command in stage `05-ECHO` which in turn creates a temp file in /tmp. \ No newline at end of file diff --git a/Installation/DataManagementSplitSystemSetup.md b/Installation/DataManagementSplitSystemSetup.md index 64dc43b7..1fa0a9c6 100644 --- a/Installation/DataManagementSplitSystemSetup.md +++ b/Installation/DataManagementSplitSystemSetup.md @@ -300,4 +300,183 @@ and u'storageDirectory': u'/home/dmadmin/storage/TEST/e1', u'storageHost': u'localhost', u'storageUrl': u'extrepid://localhost/home/dmadmin/storage/TEST/e1'} -``` \ No newline at end of file +``` + + +Next step will add a workflow and then execute this workflow. This workflow is an example pulled from the comments in the file workflowProcApi.py (owner name has been changed to match user dmtest). It creates a minimal version of a workflow that grabs the md5sum of a given file. 
The workflow is defined by the following + +``` + { + 'name' : 'example-02', + 'owner' : 'dmtest', + 'stages' : { + '01-START' : { + 'command' : '/bin/date +%Y%m%d%H%M%S', + 'outputVariableRegexList' : ['(?P<timeStamp>.*)'] + }, + '02-MKDIR' : { + 'command' : '/bin/mkdir -p /tmp/workflow.$timeStamp' + }, + '03-ECHO' : { + 'command' : '/bin/echo "START JOB ID: $id" > /tmp/workflow.$timeStamp/$id.out' + }, + '04-MD5SUM' : { + 'command' : '/bin/md5sum $filePath | cut -f1 -d" "', + 'outputVariableRegexList' : ['(?P<md5Sum>.*)'] + }, + '05-ECHO' : { + 'command' : 'echo "FILE $filePath MD5 SUM: $md5Sum" >> /tmp/workflow.$timeStamp/$id.out' + }, + '06-DONE' : { + 'command' : '/bin/echo "STOP JOB ID: $id" >> /tmp/workflow.$timeStamp/$id.out' + }, + }, + 'description' : 'Workflow Example 01' + } +``` + +This workflow can be added to the system with the command: + + > dm-upsert-workflow --py-spec=sampleWorkflow + +and will yield a result like: + +``` +id=5de938931d9a2030403a7dd0 name=example-02 owner=dmtest +``` + +This workflow can be executed by the command: + +> dm-start-processing-job --workflow-name=example-02 --workflow-owner=dmtest filePath:/home/dmadmin/testData/myData + +This will have a result like: + +``` +id=2f004219-0694-4955-af05-b29b48ce4c0a owner=dmtest status=pending startTime=1575566109.86 startTimestamp=2019/12/05 12:15:09 EST +``` + +More information can be found with `dm-get-processing-job` like: + + > dm-get-processing-job --id=2f004219-0694-4955-af05-b29b48ce4c0a --display-keys=ALL --display-format=pprint + +which returns + +```json +{ u'endTime': 1575566111.014859, + u'endTimestamp': u'2019/12/05 12:15:11 EST', + u'filePath': u'/home/dmadmin/testData/myData', + u'id': u'2f004219-0694-4955-af05-b29b48ce4c0a', + u'md5Sum': u'bac0be486ddc69992ab4e01eeade0b92', + u'nFiles': 1, + u'owner': u'dmtest', + u'runTime': 1.1574599742889404, + u'stage': u'06-DONE', + u'startTime': 1575566109.857399, + u'startTimestamp': u'2019/12/05 12:15:09 EST', + u'status': 
u'done', + u'timeStamp': u'20191205121510', + u'workflow': { u'description': u'Workflow Example 01', + u'id': u'5de938931d9a2030403a7dd0', + u'name': u'example-02', + u'owner': u'dmtest', + u'stages': { u'01-START': { u'childProcesses': { u'0': { u'childProcessNumber': 0, + u'command': u'/bin/date +%Y%m%d%H%M%S', + u'endTime': 1575566110.898553, + u'exitStatus': 0, + u'runTime': 0.007671833038330078, + u'stageId': u'01-START', + u'startTime': 1575566110.890881, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'20191205121510\n', + u'submitTime': 1575566110.859169, + u'workingDir': None}}, + u'command': u'/bin/date +%Y%m%d%H%M%S', + u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + u'nRunningChildProcesses': 0, + u'outputVariableRegexList': [ u'(?P<timeStamp>.*)']}, + u'02-MKDIR': { u'childProcesses': { u'1': { u'childProcessNumber': 1, + u'command': u'/bin/mkdir -p /tmp/workflow.20191205121510', + u'endTime': 1575566110.942735, + u'exitStatus': 0, + u'runTime': 0.0035638809204101562, + u'stageId': u'02-MKDIR', + u'startTime': 1575566110.939171, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'', + u'submitTime': 1575566110.925104, + u'workingDir': None}}, + u'command': u'/bin/mkdir -p /tmp/workflow.$timeStamp', + u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + u'nRunningChildProcesses': 0}, + u'03-ECHO': { u'childProcesses': { u'2': { u'childProcessNumber': 2, + u'command': u'/bin/echo "START JOB ID: 2f004219-0694-4955-af05-b29b48ce4c0a" > /tmp/workflow.20191205121510/2f004219-0694-4955-af05-b29b48ce4c0a.out', + u'endTime': 1575566110.972364, + u'exitStatus': 0, + u'runTime': 0.003882884979248047, + u'stageId': u'03-ECHO', + u'startTime': 1575566110.968481, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'', + u'submitTime': 1575566110.960305, + u'workingDir': None}}, + u'command': u'/bin/echo "START JOB ID: $id" > /tmp/workflow.$timeStamp/$id.out', + u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + 
u'nRunningChildProcesses': 0}, + u'04-MD5SUM': { u'childProcesses': { u'3': { u'childProcessNumber': 3, + u'command': u'/bin/md5sum /home/dmadmin/testData/myData | cut -f1 -d" "', + u'endTime': 1575566110.985139, + u'exitStatus': 0, + u'runTime': 0.0030689239501953125, + u'stageId': u'04-MD5SUM', + u'startTime': 1575566110.98207, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'bac0be486ddc69992ab4e01eeade0b92\n', + u'submitTime': 1575566110.973093, + u'workingDir': None}}, + u'command': u'/bin/md5sum $filePath | cut -f1 -d" "', + u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + u'nRunningChildProcesses': 0, + u'outputVariableRegexList': [ u'(?P<md5Sum>.*)']}, + u'05-ECHO': { u'childProcesses': { u'4': { u'childProcessNumber': 4, + u'command': u'echo "FILE /home/dmadmin/testData/myData MD5 SUM: bac0be486ddc69992ab4e01eeade0b92" >> /tmp/workflow.20191205121510/2f004219-0694-4955-af05-b29b48ce4c0a.out', + u'endTime': 1575566110.997652, + u'exitStatus': 0, + u'runTime': 0.0005791187286376953, + u'stageId': u'05-ECHO', + u'startTime': 1575566110.997073, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'', + u'submitTime': 1575566110.987421, + u'workingDir': None}}, + u'command': u'echo "FILE $filePath MD5 SUM: $md5Sum" >> /tmp/workflow.$timeStamp/$id.out', + u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + u'nRunningChildProcesses': 0}, + u'06-DONE': { u'childProcesses': { u'5': { u'childProcessNumber': 5, + u'command': u'/bin/echo "STOP JOB ID: 2f004219-0694-4955-af05-b29b48ce4c0a" >> /tmp/workflow.20191205121510/2f004219-0694-4955-af05-b29b48ce4c0a.out', + u'endTime': 1575566111.011913, + u'exitStatus': 0, + u'runTime': 0.001583099365234375, + u'stageId': u'06-DONE', + u'startTime': 1575566111.01033, + u'status': u'done', + u'stdErr': u'', + u'stdOut': u'', + u'submitTime': 1575566111.002148, + u'workingDir': None}}, + u'command': u'/bin/echo "STOP JOB ID: $id" >> /tmp/workflow.$timeStamp/$id.out', + 
u'nCompletedChildProcesses': 1, + u'nQueuedChildProcesses': 0, + u'nRunningChildProcesses': 0}}}} +``` + +Note that the md5 sum of the file `/home/dmadmin/testData/myData` is listed in the `stdOut` of stage `04-MD5SUM` and is used in the command in stage `05-ECHO` which in turn creates a temp file in /tmp. \ No newline at end of file -- GitLab