Option Explicit ' ' Unwraps and formats raw scans of books. ' Removes excessive blank lines and spaces, ' removes page numbers and headers. Ignores ' any file that appears to already have ' unwrapped text. ' ' Written by Eric Phelps ' http://www.ericphelps.com ' ' Public Domain. Distribute freely. NO WARRANTY ' Just because it worked a few times for me on my computer, ' with my version of scripting, and on the few test files ' I checked does NOT mean it will work for you! Consider ' this code a STARTING POINT, not a finished product! ' ' This program may be automated since it was DESIGNED ' not to pop up any message boxes or error dialogs ' requiring user intervention. In theory. ' ' Typical Windows 95 DOS batch command line: ' for %%x in (*.txt) do start /w cscript.exe unwrap.vbs %%x ' You can also drop etext text files on this script. ' Main Wscript.Quit 0 Sub Main() Dim filEtextFile 'As Scripting.File Dim strEtextFileName 'As String Dim tsEtext 'As Scripting.TextStream Dim strEtextContent 'As String Dim strEtextLine 'As String Dim strOldEtextLine 'As String Dim lngPosition, lngStart, lngEnd 'As Long Dim intCounter 'As Integer Dim intUnwrapped 'As Integer Dim blnPunctuation 'As Boolean Dim fs 'As Scripting.FileSystemObject Const ForReading = 1 'Scripting.IOMode Const ForWriting = 2 'Scripting.IOMode On Error Resume Next '''''''''' Get the file name passed as an argument Set fs = CreateObject("Scripting.FileSystemObject") If Wscript.Arguments.Count <> 1 Then CreateObject("WScript.Shell").Popup "You must pass a text file name on the command line!", 5, "Error" Wscript.Quit 1 End If strEtextFileName = Wscript.Arguments(0) If Not fs.FileExists(strEtextFileName) Then CreateObject("WScript.Shell").Popup Wscript.Arguments(0) & " is not a legitimate file name.", 5, "Error" Wscript.Quit 1 End If If Lcase(Right(strEtextFileName, 4)) <> ".txt" Then CreateObject("WScript.Shell").Popup Wscript.Arguments(0) & " is not a text (.TXT) file name.", 5, "Error" Wscript.Quit 1 End If Set filEtextFile = fs.GetFile(strEtextFileName) Status "Processing file: " & filEtextFile.Name 'Read the first few lines to see if it is unwrapped Set tsEtext = filEtextFile.OpenAsTextStream(ForReading) 'Get past the titles and introductions into the body For intCounter = 1 to 100 strEtextLine = tsEtext.ReadLine Next 'Check the body intUnwrapped = 0 For intCounter = 1 to 100 strEtextLine = Trim(tsEtext.ReadLine) If Len(strEtextLine) > 128 Then If Instr(1, "ABCDEFGHIJKLMNOPQRSTUVWXYZ""", Left(strEtextLine, 1), vbBinaryCompare) <> 0 Then intUnWrapped = intUnWrapped + 1 End If End If Next If intUnWrapped > 20 Then Status "This text is already unwrapped." Wscript.Quit 0 Else Status "This text is not unwrapped." End If 'Read the entire file for unwrapping Set tsEtext = filEtextFile.OpenAsTextStream(ForReading) Status "Loading etext file into memory for processing..." strEtextContent = tsEtext.ReadAll tsEtext.Close ''''''''''Unwrap the text 'Recover from stupid FTP text download strEtextContent = Replace(strEtextContent, vbCr & vbCrLf, vbCrLf) 'Remove excessive linefeeds Do While Instr(strEtextContent, vbCrLf & vbCrLf & vbCrLf) <> 0 strEtextContent = Replace(strEtextContent, vbCrLf & vbCrLf & vbCrLf, vbCrLf & vbCrLf) Loop Do While Instr(strEtextContent, vbLf & vbLf & vbLf) <> 0 strEtextContent = Replace(strEtextContent, vbLf & vbLf & vbLf, vbLf & vbLf) Loop Do While Instr(strEtextContent, vbCr & vbCr & vbCr) <> 0 strEtextContent = Replace(strEtextContent, vbCr & vbCr & vbCr, vbCr & vbCr) Loop 'Remove hyphenation Status "Unwrapping hyphenated words" strEtextContent = Replace(strEtextContent, "-" & vbCrLf, "") strEtextContent = Replace(strEtextContent, "-" & vbCr, "") strEtextContent = Replace(strEtextContent, "-" & vbLf, "") 'Save double linefeeds temporarily Status "Saving double carriage returns" strEtextContent = Replace(strEtextContent, vbCrLf & vbCrLf, "^P") strEtextContent = Replace(strEtextContent, vbCr & vbCr, "^P") strEtextContent = Replace(strEtextContent, vbLf & vbLf, "^P") 'Unwrap all text Status "Unwrapping text" strEtextContent = Replace(strEtextContent, vbCrLf, " ") strEtextContent = Replace(strEtextContent, vbCr, " ") strEtextContent = Replace(strEtextContent, vbLf, " ") 'Replace the saved double linefeeds with single linefeeds Status "Restoring double carriage returns" strEtextContent = Replace(strEtextContent, "^P", vbCrLf) 'Replace double spaces with single spaces. Status "Replacing double spaces with single spaces" Do While Instr(strEtextContent, " ") strEtextContent = Replace(strEtextContent, " ", " ") Loop Status "Replacing single quotes with double quotes" Do While Instr(strEtextContent, "''") strEtextContent = Replace(strEtextContent, "''", """") Loop Do While Instr(strEtextContent, " '") strEtextContent = Replace(strEtextContent, " '", " """) Loop Do While Instr(strEtextContent, "' ") strEtextContent = Replace(strEtextContent, "' ", """ ") Loop Do While Instr(strEtextContent, "'.") strEtextContent = Replace(strEtextContent, "'.", """.") Loop Do While Instr(strEtextContent, "'" & vbCrLf) strEtextContent = Replace(strEtextContent, "'" & vbCrLf, """" & vbCrLf) Loop Do While Instr(strEtextContent, vbCrLf & "'") strEtextContent = Replace(strEtextContent, vbCrLf & "'", vbCrLf & """") Loop Status "Replacing common mis-scans" Do While Instr(strEtextContent, "Tm") strEtextContent = Replace(strEtextContent, "Tm", "I'm") Loop Do While Instr(strEtextContent, " 1 ") strEtextContent = Replace(strEtextContent, " 1 ", " I ") Loop ''''''''''Write the changed text back to the file Status "Saving unwrapped text to file" Set tsEtext = filEtextFile.OpenAsTextStream(ForWriting) tsEtext.Write strEtextContent strEtextContent = "" tsEtext.Close Status "Removing page numbers and embedded title lines" Set tsEtext = filEtextFile.OpenAsTextStream(ForReading) 'Read the first four lines and allow anything. strEtextContent = tsEtext.ReadLine & vbCrLf strEtextContent = strEtextContent & tsEtext.ReadLine & vbCrLf strEtextContent = strEtextContent & tsEtext.ReadLine & vbCrLf strOldEtextLine = Trim(tsEtext.ReadLine) 'Read the rest of the file, removing page numbers and titles, etc.. Do Until tsEtext.AtEndOfStream strEtextLine = tsEtext.ReadLine strEtextLine = Trim(strEtextLine) If strEtextLine <> "" Then 'Remove lines that contain tabs If Instr(strEtextLine, vbTab) <> 0 Then strEtextLine = " " End If 'Remove short lines that start with numbers If Instr("123456789", Left(strEtextLine, 1)) Then If Len(strEtextLine) < 60 Then If Right(strEtextLine, 1) <> "." Then strEtextLine = " " End If End If End If 'Remove short lines that end with numbers If Instr("0123456789", Right(strEtextLine, 1)) Then If Len(strEtextLine) < 60 Then strEtextLine = " " End If End If 'Remove short lines that have no ending punctuation If Len(strEtextLine) < 40 Then blnPunctuation = False If Right(strEtextLine, 1) = "." Then blnPunctuation = True If Right(strEtextLine, 1) = "!" Then blnPunctuation = True If Right(strEtextLine, 1) = "?" Then blnPunctuation = True If Right(strEtextLine, 1) = """" Then blnPunctuation = True If Not blnPunctuation Then strEtextLine = " " End If End If 'Four possibilities: good-good, good-bad, bad-good, bad-bad 'Good lines are real lines. Bad lines are header lines 'that have been turned into a single " " by the above code. 'Good-good If ((strOldEtextLine <> " ") And (strEtextLine <> " ")) Then strEtextContent = strEtextContent & strOldEtextLine & vbCrLf End If 'Good-bad If ((strOldEtextLine <> " ") And (strEtextLine = " ")) Then strEtextContent = strEtextContent & strOldEtextLine & " " End If 'Bad-good If ((strOldEtextLine = " ") And (strEtextLine <> " ")) Then 'Do nothing End If 'Bad-bad If ((strOldEtextLine = " ") And (strEtextLine = " ")) Then 'Do nothing End If strOldEtextLine = strEtextLine Loop strEtextContent = strEtextContent & strOldEtextLine ''''''''''Write the changed text back to the file Status "Saving changes for the last time" Set tsEtext = filEtextFile.OpenAsTextStream(ForWriting) tsEtext.Write strEtextContent strEtextContent = "" tsEtext.Close 'Finished! Tell the world. CreateObject("WScript.Shell").Popup "Finished " & filEtextFile.Name, 2, "Success" End Sub Sub Status (strMessage) 'If the program was run with CSCRIPT, this writes a 'line into the DOS box. If run with WSCRIPT, it does nothing. If Lcase(Right(Wscript.FullName, 12)) = "\cscript.exe" Then Wscript.Echo strMessage End If End Sub