Skip to content

Commit f98c8ea

Browse files
authored
Migrate NT sequence files from flat dir to hashed structure (#378)
* Migrate NT sequence files from flat dir to hashed structure
1 parent 43f4603 commit f98c8ea

File tree

6 files changed

+134
-43
lines changed

6 files changed

+134
-43
lines changed

SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/RefNtSequenceModel.java

Lines changed: 32 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
import htsjdk.samtools.util.StringUtil;
1919
import org.apache.commons.io.IOUtils;
2020
import org.apache.logging.log4j.Logger;
21-
import org.apache.logging.log4j.LogManager;
2221
import org.jetbrains.annotations.Nullable;
2322
import org.labkey.api.data.Container;
2423
import org.labkey.api.data.ContainerManager;
@@ -32,8 +31,11 @@
3231
import org.labkey.api.exp.api.ExpData;
3332
import org.labkey.api.exp.api.ExperimentService;
3433
import org.labkey.api.files.FileContentService;
34+
import org.labkey.api.security.Crypt;
3535
import org.labkey.api.security.User;
36+
import org.labkey.api.util.FileUtil;
3637
import org.labkey.api.util.MemTracker;
38+
import org.labkey.api.util.logging.LogHelper;
3739
import org.labkey.api.writer.PrintWriters;
3840

3941
import java.io.File;
@@ -55,7 +57,9 @@
5557
*/
5658
public class RefNtSequenceModel implements Serializable
5759
{
58-
private static final Logger _log = LogManager.getLogger(RefNtSequenceModel.class);
60+
private static final Logger _log = LogHelper.getLogger(RefNtSequenceModel.class, "Messages related to Reference NT Sequences");
61+
62+
public static String BASE_DIRNAME = ".sequences";
5963

6064
private int _rowid;
6165
private String _name;
@@ -414,7 +418,7 @@ public byte[] getSequenceBases()
414418

415419
public void createFileForSequence(User u, String sequence, @Nullable File outDir) throws IOException
416420
{
417-
File output = getExpectedSequenceFile(outDir);
421+
File output = getExpectedSequenceFile();
418422
if (output.exists())
419423
{
420424
output.delete();
@@ -439,9 +443,9 @@ public void createFileForSequence(User u, String sequence, @Nullable File outDir
439443
Table.update(u, ti, this, _rowid);
440444
}
441445

442-
private File getExpectedSequenceFile(@Nullable File outDir) throws IllegalArgumentException
446+
public File getExpectedSequenceFile() throws IllegalArgumentException
443447
{
444-
return new File(getSequenceDir(true, outDir), _rowid + ".txt.gz");
448+
return FileUtil.appendName(getHashedDir(true), _rowid + ".txt.gz");
445449
}
446450

447451
private Container getLabKeyContainer()
@@ -455,33 +459,17 @@ private Container getLabKeyContainer()
455459
return c;
456460
}
457461

458-
private File getSequenceDir(boolean create, @Nullable File outDir) throws IllegalArgumentException
462+
private File getBaseSequenceDir() throws IllegalArgumentException
459463
{
460464
Container c = getLabKeyContainer();
461-
File ret = outDir == null ? getReferenceSequenceDir(c) : outDir;
462-
if (create && !ret.exists())
463-
{
464-
ret.mkdirs();
465-
}
466-
467-
return ret;
468-
}
469-
470-
private File getReferenceSequenceDir(Container c) throws IllegalArgumentException
471-
{
472465
FileContentService fileService = FileContentService.get();
473466
File root = fileService == null ? null : fileService.getFileRoot(c, FileContentService.ContentType.files);
474467
if (root == null)
475468
{
476469
throw new IllegalArgumentException("File root not defined for container: " + c.getPath());
477470
}
478471

479-
return new File(root, ".sequences");
480-
}
481-
482-
public void writeSequence(Writer writer, int lineLength) throws IOException
483-
{
484-
writeSequence(writer, lineLength, null, null);
472+
return FileUtil.appendName(root, BASE_DIRNAME);
485473
}
486474

487475
public void writeSequence(Writer writer, int lineLength, Integer start, Integer end) throws IOException
@@ -562,6 +550,26 @@ public File getOffsetsFile()
562550
return null;
563551
}
564552

565-
return new File(d.getFile().getParentFile(), getRowid() + "_offsets.txt");
553+
return FileUtil.appendName(d.getFile().getParentFile(), getRowid() + "_offsets.txt");
554+
}
555+
556+
private File getHashedDir(boolean create)
557+
{
558+
File baseDir = getBaseSequenceDir();
559+
String digest = Crypt.MD5.digest(String.valueOf(getRowid()));
560+
561+
baseDir = FileUtil.appendName(baseDir, digest.substring(0,4));
562+
baseDir = FileUtil.appendName(baseDir, digest.substring(4,8));
563+
baseDir = FileUtil.appendName(baseDir, digest.substring(8,12));
564+
baseDir = FileUtil.appendName(baseDir, digest.substring(12,20));
565+
baseDir = FileUtil.appendName(baseDir, digest.substring(20,28));
566+
baseDir = FileUtil.appendName(baseDir, digest.substring(28,32));
567+
568+
if (create)
569+
{
570+
baseDir.mkdirs();
571+
}
572+
573+
return baseDir;
566574
}
567575
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SELECT core.executeJavaUpgradeCode('migrateSequenceDirs');
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
EXEC core.executeJavaUpgradeCode 'migrateSequenceDirs';

SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisMaintenanceTask.java

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
import java.util.HashSet;
4747
import java.util.List;
4848
import java.util.Map;
49+
import java.util.Objects;
4950
import java.util.Set;
5051
import java.util.stream.Collectors;
5152
import java.util.stream.Stream;
@@ -301,10 +302,10 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel
301302
{
302303
//first sequences
303304
log.debug("Inspecting sequences");
304-
File sequenceDir = new File(root.getRootPath(), ".sequences");
305+
File sequenceDir = FileUtil.appendName(root.getRootPath(), ".sequences");
305306
TableInfo tableRefNtSequences = SequenceAnalysisSchema.getTable(SequenceAnalysisSchema.TABLE_REF_NT_SEQUENCES);
306307
TableSelector ntTs = new TableSelector(tableRefNtSequences, new SimpleFilter(FieldKey.fromString("container"), c.getId()), null);
307-
final Set<String> expectedSequences = new HashSet<>(10000, 1000);
308+
final Set<File> expectedSequences = new HashSet<>(10000, 1000);
308309
ntTs.forEach(RefNtSequenceModel.class, m -> {
309310
if (m.getSequenceFile() == null || m.getSequenceFile() == 0)
310311
{
@@ -319,26 +320,23 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel
319320
return;
320321
}
321322

322-
if (!d.getFile().exists())
323-
{
324-
log.error("expected sequence file does not exist for sequence: " + m.getRowid() + " " + m.getName() + ", expected: " + d.getFile().getPath());
325-
return;
326-
}
327-
328323
if (d.getFile().getAbsolutePath().toLowerCase().startsWith(sequenceDir.getAbsolutePath().toLowerCase()))
329324
{
330-
expectedSequences.add(d.getFile().getName());
325+
expectedSequences.add(d.getFile());
331326
}
332327
});
333328

334329
if (sequenceDir.exists())
335330
{
336-
for (File child : sequenceDir.listFiles())
331+
inspectSequenceDir(sequenceDir, expectedSequences, log);
332+
}
333+
334+
if (!expectedSequences.isEmpty())
335+
{
336+
for (File missing : expectedSequences)
337337
{
338-
if (!expectedSequences.contains(child.getName()))
339-
{
340-
deleteFile(child, log);
341-
}
338+
log.error("expected sequence file does not exist: " + missing.getPath());
339+
return;
342340
}
343341
}
344342

@@ -446,12 +444,12 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel
446444
continue;
447445
}
448446

449-
deleteFile(new File(child, fileName), log);
447+
deleteFile(FileUtil.appendName(child, fileName), log);
450448
}
451449
}
452450

453451
//check/verify tracks
454-
File trackDir = new File(child, "tracks");
452+
File trackDir = FileUtil.appendName(child, "tracks");
455453
if (trackDir.exists())
456454
{
457455
Set<String> expectedTracks = new HashSet<>();
@@ -486,7 +484,7 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel
486484
}
487485

488486
//check/verify chainFiles
489-
File chainDir = new File(child, "chainFiles");
487+
File chainDir = FileUtil.appendName(child, "chainFiles");
490488
if (chainDir.exists())
491489
{
492490
Set<String> expectedChains = new HashSet<>();
@@ -555,7 +553,7 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel
555553
}
556554
}
557555

558-
File sequenceOutputsDir = new File(root.getRootPath(), "sequenceOutputs");
556+
File sequenceOutputsDir = FileUtil.appendName(root.getRootPath(), "sequenceOutputs");
559557
if (sequenceOutputsDir.exists())
560558
{
561559
for (File child : sequenceOutputsDir.listFiles())
@@ -576,6 +574,24 @@ private void processContainer(Container c, Logger log) throws IOException, Pipel
576574
}
577575
}
578576

577+
private void inspectSequenceDir(File sequenceDir, Set<File> expectedSequences, Logger log) throws IOException
578+
{
579+
for (File child : Objects.requireNonNull(sequenceDir.listFiles()))
580+
{
581+
if (child.isDirectory())
582+
{
583+
inspectSequenceDir(child, expectedSequences, log);
584+
}
585+
else
586+
{
587+
if (!expectedSequences.remove(child))
588+
{
589+
deleteFile(child, log);
590+
}
591+
}
592+
}
593+
}
594+
579595
private void deleteFile(File f, Logger log) throws IOException
580596
{
581597
log.info("deleting sequence file: " + f.getPath());

SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ public String getName()
209209
@Override
210210
public Double getSchemaVersion()
211211
{
212-
return 12.331;
212+
return 12.332;
213213
}
214214

215215
@Override

SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisUpgradeCode.java

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,4 +229,69 @@ public void updateBarcodeRC(final ModuleContext moduleContext)
229229
});
230230
}
231231
}
232+
233+
/** called at 12.331-12.332*/
234+
@SuppressWarnings({"UnusedDeclaration"})
235+
@DeferredUpgrade
236+
public void migrateSequenceDirs(final ModuleContext moduleContext)
237+
{
238+
try
239+
{
240+
TableInfo ti = SequenceAnalysisSchema.getTable(SequenceAnalysisSchema.TABLE_REF_NT_SEQUENCES);
241+
TableSelector ts = new TableSelector(ti);
242+
List<RefNtSequenceModel> nts = ts.getArrayList(RefNtSequenceModel.class);
243+
_log.info(nts.size() + " total sequences to migrate");
244+
int processed = 0;
245+
for (RefNtSequenceModel nt : nts)
246+
{
247+
processed++;
248+
249+
if (processed % 1000 == 0)
250+
{
251+
_log.info("{} of {} sequence files migrated", processed, nts.size());
252+
}
253+
254+
ExpData legacyExpData = ExperimentService.get().getExpData(nt.getSequenceFile());
255+
if (legacyExpData == null)
256+
{
257+
_log.error("Missing ExpData for NT sequence: {}", nt.getSequenceFile());
258+
continue;
259+
}
260+
261+
File legacyFile = legacyExpData.getFile();
262+
if (!legacyFile.exists())
263+
{
264+
_log.error("Missing file for NT sequence: {}", legacyFile.getPath());
265+
continue;
266+
}
267+
268+
if (!RefNtSequenceModel.BASE_DIRNAME.equals(legacyFile.getParentFile().getName()))
269+
{
270+
_log.error("Sequence appears to have already been migrated, this might indicate a retry after a failed move: {}", legacyFile.getPath());
271+
continue;
272+
}
273+
274+
File newLocation = nt.getExpectedSequenceFile();
275+
if (!newLocation.getParentFile().exists())
276+
{
277+
newLocation.getParentFile().mkdirs();
278+
}
279+
280+
if (newLocation.exists())
281+
{
282+
_log.error("Target location for migrated sequence file exists, this might indicate a retry after a filed move: {}", newLocation.getPath());
283+
continue;
284+
}
285+
286+
FileUtils.copyFile(legacyFile, newLocation);
287+
legacyExpData.setDataFileURI(newLocation.toURI());
288+
legacyExpData.save(moduleContext.getUpgradeUser());
289+
legacyFile.delete();
290+
}
291+
}
292+
catch (Exception e)
293+
{
294+
_log.error("Error upgrading sequenceanalysis module", e);
295+
}
296+
}
232297
}

0 commit comments

Comments
 (0)