Splitting huge file into various files

Help with writing and running scripts

Splitting huge file into various files

Postby ewibe » Tue Oct 06, 2009 4:29 am

I have a file containing about 2 million lines of this type:

Code: Select all
204/23-1                             3878.88             97
204/23-1                             3879.03             97
204/23-1                             3879.18             97
204/23-1                             3879.33             97
204/23-1                             3879.48             97
204/23-1                             3879.63             97
204/23-1                             3879.78             97
204/23-1                             3879.93             97
204/24-1A                            0458.63              0
204/24-1A                            0458.78              0
204/24-1A                            0458.93              0
204/24-1A                            0459.08              0
204/24-1A                            0459.23              0
204/24-1A                            0459.38              0
204/24-1A                            0459.53              0
204/24-1A                            0459.68              0

I want to split this file and save it with name based on the first column. The number of rows per file varies.

I was thinking of using an underscore as a replacement for the / in the names of the files.
It is not that important what is used as long as the files can be easily identified so I don't import the data into the wrong place afterwards.
ewibe
Newbie
 
Posts: 1
Joined: Tue Oct 06, 2009 4:22 am

Re: Splitting huge file into various files

Postby Mofi » Tue Oct 06, 2009 2:48 pm

Here is one solution. It is surely not the fastest or best one, but it worked for your example. It is best to test the script on a small file first and, when it works, run it on the huge file during a longer break or overnight. Please read the comments at the top of the script!

Code: Select all
/* Insert here the functions GetFilePath and GetFileExt from
   http://www.ultraedit.com/files/scripts/FileNameFunctions.js
   or replace the function calls with fixed strings.
   Please note: The path string must end with a backslash and the file
   extension string must not include the dot. You must also use 2 backslashes
   for every backslash in the path string, like "C:\\Temp\\". */

// Returns the position of the active document inside UltraEdit.document,
// or -1 when no entry has the same path as UltraEdit.activeDocument.
// The first matching entry wins. Based on
// http://www.ultraedit.com/forums/viewtopic.php?f=52&t=4571
function getActiveDocumentIndex () {
   var nIndex = 0;
   // Walk the open documents in order; the active document's path is only
   // read while there is still a document left to compare against.
   while (nIndex < UltraEdit.document.length) {
      var bSamePath = (UltraEdit.activeDocument.path == UltraEdit.document[nIndex].path);
      if (bSamePath) {
         return nIndex;
      }
      nIndex += 1;
   }
   return -1;   // Nothing matched (for example no file is open).
}

// Build the full name of an output file from the first-column string of a
// data block. Every character that is not allowed in a Windows file name
// (\ / : * ? " < > |) is replaced by an underscore. sPath is used as is in
// front of the name and sExt is appended after a dot.
function buildSplitFileName (sFieldText, sPath, sExt) {
   return sPath + sFieldText.replace(/[\\\/:*?"<>|]/g, "_") + "." + sExt;
}

// Split the active file into one file per run of consecutive lines which
// start with the same string (text up to the first space or tab). Each
// block is saved as <path><first column>.<extension> and at the end the
// number of saved files is reported in a message box.
var nDataFileIndex = getActiveDocumentIndex();

if (nDataFileIndex >= 0) {  // Is any file open?

   var sRow = "";        // Text of the currently selected line.
   var sData = "";       // Rows collected so far for the current block.
   var sField = "";      // First-column string of the current block.
   var nFileCount = 0;   // Number of files saved by this script.
   // NOTE(review): -1 presumably selects the active document in the
   // FileNameFunctions.js helpers - confirm against that script.
   var sFileExt  = GetFileExt(-1);
   var sFilePath = GetFilePath(-1);
   var DataFile = UltraEdit.document[nDataFileIndex];

   if (sFileExt == "") sFileExt = "csv";

   // Define the working environment for this script.
   UltraEdit.insertMode();
   UltraEdit.columnModeOff();
   UltraEdit.activeDocument.hexOff();
   UltraEdit.perlReOn();

   // Make sure the last line of the file has a line termination.
   DataFile.bottom();
   if (DataFile.isColNumGt(1)) {
      DataFile.insertLine();
      // Remove whatever the editor inserted on the new line.
      if (DataFile.isColNumGt(1)) {
         DataFile.deleteToStartOfLine();
      }
   }
   DataFile.top();       // Start from top of the file.
   UltraEdit.newFile();  // Open now a new file to avoid display updates.

   // Evaluate the file line by line until cursor reaches end of file.
   while (!DataFile.isEof()) {

      DataFile.selectLine();        // Select the current line.
      sRow = DataFile.selection;    // Get the selection into a variable.
      // Get the string from start of the line to first space or tab.
      var asFields = sRow.match(/^[^ \t\r\n]+/);

      // Blank lines should be ignored which means no matching string found.
      if (asFields) {
         if (sField === asFields[0]) {
            // Same first column as the row before: extend the current block.
            sData += sRow;
         } else {                    // New field string detected.
            if (sData !== "") {      // Some data already collected?
               UltraEdit.newFile();  // Write the data into a new file.
               UltraEdit.activeDocument.write(sData);
               // Save the new file under the sanitized name and close it.
               UltraEdit.saveAs(buildSplitFileName(sField, sFilePath, sFileExt));
               UltraEdit.closeFile(UltraEdit.activeDocument.path,2);
               nFileCount++;
            }
            // Store current field string and row for next data block.
            sData = sRow;
            sField = asFields[0];
         }
      }
      DataFile.key("HOME");  // To unselect the selected line.
   }

   DataFile.top();      // Set the cursor back to top of the data file.
   if (sData !== "") {  // Is there a block not already saved into a file?
      // The last block goes into the last document in the list, which is
      // made active first.
      UltraEdit.document[UltraEdit.document.length-1].setActive();
      UltraEdit.activeDocument.write(sData);
      UltraEdit.saveAs(buildSplitFileName(sField, sFilePath, sFileExt));
      // Count this file too, otherwise the message below reports one file
      // less than was really saved.
      nFileCount++;
   }
   // Close the new file created first (normally with data).
   UltraEdit.closeFile(UltraEdit.activeDocument.path,2);
   var sSuffix = (nFileCount === 1) ? " file." : " files.";
   UltraEdit.messageBox("Script saved "+nFileCount+sSuffix);
}
User avatar
Mofi
Grand Master
Grand Master
 
Posts: 4042
Joined: Thu Jul 29, 2004 11:00 pm
Location: Vienna


Return to Scripts

cron