2012-10-23 130 views
0

我有一個流程（flow），用來識別何時有文件被放入某個目錄。接下來，我需要運行一個處理該文件的Bash腳本（處理相當密集）。該腳本讀取PDF，創建臨時目錄，將PDF拆分爲單獨的圖像文件，對每個圖像運行OCR處理器，將結果轉換爲單頁PDF，然後把所有單頁PDF合併成一個帶有OCR文本層的多頁PDF。

（標題：從Mule ESB運行shell腳本）

問題是，同時觸發10個轉換之後，Bash腳本就會卡死。現在我讓Mule ESB監聽新文件，然後爲每個文件觸發該腳本並傳遞相應的參數。不幸的是，Mule只做兩件事：監聽 -> 觸發。該目錄中會有超過200個文件需要排隊等待處理，最好是每次只處理5個文件。我如何讓Mule限制同時觸發的併發進程數量？

下面是我最初的流程（flow）草案：

<?xml version="1.0" encoding="UTF-8"?>

<!-- Polls an input directory and runs /home/administrator/ocr.sh once for every file found. -->
<mule xmlns:cxf="http://www.mulesoft.org/schema/mule/cxf"
    xmlns:scripting="http://www.mulesoft.org/schema/mule/scripting"
    xmlns:http="http://www.mulesoft.org/schema/mule/http"
    xmlns:file="http://www.mulesoft.org/schema/mule/file"
    xmlns="http://www.mulesoft.org/schema/mule/core"
    xmlns:doc="http://www.mulesoft.org/schema/mule/documentation"
    xmlns:spring="http://www.springframework.org/schema/beans"
    version="CE-3.3.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="
http://www.mulesoft.org/schema/mule/file http://www.mulesoft.org/schema/mule/file/current/mule-file.xsd
http://www.mulesoft.org/schema/mule/scripting http://www.mulesoft.org/schema/mule/scripting/current/mule-scripting.xsd
http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-current.xsd
http://www.mulesoft.org/schema/mule/core http://www.mulesoft.org/schema/mule/core/current/mule.xsd
http://www.mulesoft.org/schema/mule/cxf http://www.mulesoft.org/schema/mule/cxf/current/mule-cxf.xsd
http://www.mulesoft.org/schema/mule/http http://www.mulesoft.org/schema/mule/http/current/mule-http.xsd ">
    <!-- NOTE(review): this disables threading in the default profile; confirm it does not
         override the thread limit configured by the processing strategy below. -->
    <configuration>
        <default-threading-profile doThreading="false"/>
    </configuration>

    <!-- Caps the flow's worker threads. The cap only throttles the OCR runs because the
         script component below blocks until the shell script has finished. -->
    <queued-asynchronous-processing-strategy name="limitThreads" maxThreads="2"/>

    <flow name="Poll_DirectoryFlow1" doc:name="Poll_DirectoryFlow1" processingStrategy="limitThreads">
        <!-- Poll every 5 s; fileAge skips files modified less than 5 s ago. -->
        <file:inbound-endpoint path="/home/administrator/Downloads/Input" responseTimeout="10000" doc:name="File" pollingFrequency="5000" fileAge="5000">

        </file:inbound-endpoint>
        <scripting:component doc:name="Script">
            <scripting:script engine="Groovy">
                <property key="originalFilename" value="#[header:originalFilename]"/>
                <scripting:text><![CDATA[// Name and directory of the picked-up file, read from the inbound properties.
def filename = message.getInboundProperty('originalFilename')
println "$filename"
def directory = message.getInboundProperty('directory')
println "$directory"

// ocr.sh moves the source PDF into ./processed, so that directory has to exist.
"mkdir processed".execute()

// Debug aid: show the working directory that ocr.sh's relative paths resolve against.
def pwd = "pwd".execute()
pwd.waitFor()
println "${pwd.in.text}"

def command = ["/home/administrator/ocr.sh", "$directory/$filename", "/home/administrator/Downloads/Output/$filename"]
println "$command"

// Block on the OCR process itself (the earlier draft waited on "pwd", so every flow
// thread was released immediately and nothing limited the number of running scripts).
// waitForProcessOutput also drains stdout/stderr, so the child cannot stall on a full pipe.
def ocr = command.execute()
ocr.waitForProcessOutput(System.out, System.err)]]></scripting:text>
            </scripting:script>
        </scripting:component>
        <echo-component doc:name="Echo"/>
    </flow>
</mule>

下面是實際的bash腳本(給出了我們使用的是什麼工具的一些提示):

#!/bin/bash
#
# OCR a scanned PDF and write a searchable (image + text layer) PDF.
#
# Usage: ocr.sh source.pdf output.pdf
#   source.pdf  PDF to OCR; it is moved into ./processed once it has been split
#   output.pdf  path of the resulting PDF
#
# Exit status: 1 if fewer than two arguments are given, 2 if the source does not
# end in .pdf (any letter case).
#
# Relative paths (./split, ./processed, error.log) resolve against the caller's
# working directory. Requires pdftk, gs, tesseract and hocr2pdf on PATH.

#Check to see that the correct number of params have been passed.
#Done first so nothing is logged or created for an invalid call.
if [[ $# -lt 2 ]];
then
    echo "Usage: $0 source.pdf output.pdf"
    echo "output.pdf is the desired output file"
    echo "source.pdf is a file to be OCR'd"
    exit 1
fi

#Setting variables
#WORKDIR instead of TMPDIR: TMPDIR is the conventional temp-dir environment
#variable, and reassigning it would change it for child processes when exported.
WORKDIR=./split
INFILENAME=${1##*/}
OUTFILENAME=${2##*/}
PAGEDIR="$WORKDIR/$INFILENAME"
echo "1 is $1"
echo "2 is $2"
echo "infilename is $INFILENAME"
echo "outfilename is $OUTFILENAME"

#Logging I/O filenames
echo "infile: $1" >> error.log
echo "outfile: $2" >> error.log

#Make sure the input file is a PDF (extension compared case-insensitively)
case "$1" in
    *.[pP][dD][fF]) ;;
    *)
        echo "Please enter a valid input pdf file"
        exit 2
        ;;
esac

#Check to see if the input file is a multi-layered pdf with searchable text.
#Heuristic: any "Font" string in the file counts as an existing text layer.
if grep -qF "Font" "$1";
then
    echo "The input file is multi-layered"
    mv "$1" "$2"
else
    #mkdir -p is safe when several instances start at once and when the
    #directories already exist.
    mkdir -p "$PAGEDIR" processed
    echo "making temporary directory $PAGEDIR"

    #Split the PDF into pdf's of one page per pdf in a temporary directory
    pdftk "$1" burst output "$PAGEDIR/pg_%04d.pdf"
    echo "burse output to $PAGEDIR/pg_%04d.pdf"
    mv "$1" processed/

    for page in "$PAGEDIR"/pg_*.pdf
    do
        #An unmatched glob stays literal; skip it instead of processing a bogus name
        [ -e "$page" ] || continue
        echo "$page"
        filename=$(basename "$page")
        filename="${filename%.*}"

        #Convert the pdf page into an image
        gs -r300 -o "$PAGEDIR/$filename.jpeg" -sDEVICE=jpeg "$PAGEDIR/$filename.pdf"

        #Perform the OCR against the image
        tesseract "$PAGEDIR/$filename.jpeg" "$PAGEDIR/$filename" hocr

        #Combine the OCR'd image and OCR'd text into a multi-layer PDF file of that page
        hocr2pdf -i "$PAGEDIR/$filename.jpeg" -o "$PAGEDIR/$filename.pdf" < "$PAGEDIR/$filename.html"
        compressed="$filename-compressed.pdf"

        #Compress the multi-layered PDF of the page.
        #Output and input must be two separate arguments, otherwise gs gets no input file.
        gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -sOutputFile="$PAGEDIR/$compressed" "$PAGEDIR/$filename.pdf"

        #Replace the page with its compressed version (keep .pdf so the merge below sees it)
        mv "$PAGEDIR/$compressed" "$PAGEDIR/$filename.pdf"
    done

    #Concatenate all of the multi-layer PDF pages into a single PDF file.
    #Intermediates live in the per-file directory so they are removed with it.
    merged="$PAGEDIR/merged.pdf"
    pdftk "$PAGEDIR"/pg_*.pdf cat output "$merged"
    compressed="$PAGEDIR/merged-compressed.pdf"

    #Compress the multi-layered PDF
    gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -sOutputFile="$compressed" "$merged"
    mv "$compressed" "$2"
    rm -rf "$PAGEDIR"
fi

回答

0

@genjosanzo ……你讓我走上了正確的方向，開始考慮處理策略（processing strategy）。以下是最終可行的解決方案：

<?xml version="1.0" encoding="UTF-8"?>

<!-- Polls the input directory for PDFs and runs /home/administrator/ocr.sh on each one,
     with the number of concurrent runs bounded by the flow's processing strategy. -->
<mule xmlns:cxf="http://www.mulesoft.org/schema/mule/cxf"
    xmlns:scripting="http://www.mulesoft.org/schema/mule/scripting"
    xmlns:http="http://www.mulesoft.org/schema/mule/http" xmlns:file="http://www.mulesoft.org/schema/mule/file"
    xmlns="http://www.mulesoft.org/schema/mule/core" xmlns:doc="http://www.mulesoft.org/schema/mule/documentation"
    xmlns:spring="http://www.springframework.org/schema/beans" version="CE-3.3.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="
http://www.mulesoft.org/schema/mule/file http://www.mulesoft.org/schema/mule/file/current/mule-file.xsd
http://www.mulesoft.org/schema/mule/scripting http://www.mulesoft.org/schema/mule/scripting/current/mule-scripting.xsd
http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-current.xsd
http://www.mulesoft.org/schema/mule/core http://www.mulesoft.org/schema/mule/core/current/mule.xsd
http://www.mulesoft.org/schema/mule/cxf http://www.mulesoft.org/schema/mule/cxf/current/mule-cxf.xsd
http://www.mulesoft.org/schema/mule/http http://www.mulesoft.org/schema/mule/http/current/mule-http.xsd ">

    <!-- Caps the flow's worker threads. Each thread blocks in the script component until
         ocr.sh exits, so this also caps the number of concurrent OCR processes. -->
    <queued-asynchronous-processing-strategy
        name="limitThreads" maxThreads="7"
        doc:name="Queued Asynchronous Processing Strategy" />
    <flow name="Poll_DirectoryFlow1" doc:name="Poll_DirectoryFlow1"
        processingStrategy="limitThreads">
        <!-- Poll once a minute; fileAge skips files modified less than 5 s ago. -->
        <file:inbound-endpoint path="/home/administrator/Downloads/Input"
            responseTimeout="10000" doc:name="File" pollingFrequency="60000"
            fileAge="5000">
            <!-- Only pick up *.pdf, in any letter case. -->
            <file:filename-regex-filter pattern="^.*\.(pdf)$"
                caseSensitive="false" />
        </file:inbound-endpoint>
        <scripting:component doc:name="Script">
            <scripting:script engine="Groovy">
                <scripting:text><![CDATA[// Name and directory of the picked-up file, read from the inbound properties.
def filename = message.getInboundProperty('originalFilename')
println "$filename"
def directory = message.getInboundProperty('directory')
println "$directory"

// ocr.sh moves the source PDF into ./processed, so that directory has to exist.
"mkdir processed".execute()

def command = ["/home/administrator/ocr.sh", "$directory/$filename", "/home/administrator/Downloads/Output/$filename"]
println "$command"
def cmd = command.execute()

// Wait for the script AND drain its stdout/stderr. A bare waitFor() leaves the pipes
// unread, so a chatty OCR run can fill the pipe buffer and block forever, pinning one
// of the limited flow threads.
cmd.waitForProcessOutput(System.out, System.err)

// Report the real outcome instead of always claiming success.
def exitCode = cmd.exitValue()
println exitCode == 0 ? "$filename has completed processing" : "$filename failed with exit code $exitCode"]]></scripting:text>
            </scripting:script>
        </scripting:component>
        <echo-component doc:name="Echo"/>
    </flow>
</mule>
+0

選擇使用Groovy而不是MEL，有什麼原因嗎？ –

+0

因爲我不懂MEL……哈哈（lol）。 – Thaneofife

+0

@DavidDossot如何使用MEL執行shell腳本? – Daniel

1

針對您的問題，一個簡單的解決方案是：不要使用您正在設置的基於線程配置（threading profile）的策略，而是用如下配置的池化（pooled）Java組件來替換腳本組件：

<!-- Pooled Java component that should invoke the bash script. The pool size is the
     concurrency limit: at most maxActive instances are borrowed at once, and further
     messages wait (WHEN_EXHAUSTED_WAIT, maxWait="-1" = no timeout) until one is returned.
     maxActive is set to 5 to match the "five files at a time" requirement; a value of 0
     would not give that cap.
     NOTE(review): org.mule.PooledComponent looks like a placeholder; replace it with the
     class that actually runs the script. -->
<pooled-component class="org.mule.PooledComponent">
     <pooling-profile exhaustedAction="WHEN_EXHAUSTED_WAIT" maxActive="5" maxWait="-1" initialisationPolicy="INITIALISE_NONE"/>
</pooled-component>

您應該把對bash腳本的調用放在該組件中。您可以在這裡（here）找到它的文檔。