|
1 | 1 | package kscript.text |
2 | 2 |
|
| 3 | +import kscript.stopIfNot |
| 4 | + |
3 | 5 | /** |
4 | 6 | * Utility methods to allow for awk-like data processing using kscript. |
5 | 7 | * |
@@ -36,7 +38,7 @@ fun Sequence<String>.split(separator: String = "\t"): Sequence<Row> { |
36 | 38 | return this.map { it.split(separator) } |
37 | 39 | } |
38 | 40 |
|
39 | | -/** awk-like convenience wrapper around columns->map->print */ |
/**
 * awk-like one-liner: splits every input line on [separator], turns each resulting row into a
 * string via [rule], and hands the resulting strings to `print()`.
 *
 * @param separator column delimiter passed to `split`; defaults to a tab character.
 * @param rule maps one split row to the string that is emitted for it.
 */
fun Sequence<String>.awk(separator: String = "\t", rule: (Row) -> String) =
    split(separator).map(rule).print()
41 | 43 |
|
42 | 44 |
|
@@ -65,5 +67,69 @@ fun Sequence<Row>.print(separator: String = "\t") = join(separator).print() |
/** Writes every row of this list to standard output, one `println` call per row. */
fun List<Row>.print() {
    for (row in this) {
        println(row)
    }
}
66 | 68 |
|
67 | 69 |
|
| 70 | +// |
| 71 | +// Column Select |
| 72 | +// |
| 73 | + |
| 74 | + |
/**
 * Internal representation of a column selection: a list of column [indices] plus builder
 * methods that extend it. Usually not used directly but rather via [with] and [without].
 *
 * @property indices the column indices collected so far; empty by default.
 */
abstract class ColSelect(
    val indices: Array<Int> = emptyArray()
) {
    /** Returns a selection that additionally contains every column index in [range]. */
    abstract fun and(range: IntRange): ColSelect

    /** Returns a selection that additionally contains the single index [column]. */
    abstract fun and(column: Int): ColSelect
}
| 81 | + |
/**
 * Selection whose [indices] are treated by `select` as the columns to keep.
 * Each `and` call returns a fresh instance; the receiver is left untouched.
 */
class PosSelect(arrayOf: Array<Int>) : ColSelect(arrayOf) {
    /** Returns a new selection with [column] appended after the current indices. */
    override fun and(column: Int): PosSelect = PosSelect(indices + column)

    /** Returns a new selection with every value of [range] appended after the current indices. */
    override fun and(range: IntRange): PosSelect = PosSelect(indices + range.toList())
}
| 86 | + |
/**
 * Selection whose [indices] are treated by `select` as the columns to leave out.
 * Each `and` call returns a fresh instance; the receiver is left untouched.
 */
class NegSelect(arrayOf: Array<Int>) : ColSelect(arrayOf) {
    /** Returns a new selection with [column] appended after the current indices. */
    override fun and(column: Int): NegSelect = NegSelect(indices + column)

    /** Returns a new selection with every value of [range] appended after the current indices. */
    override fun and(range: IntRange): NegSelect = NegSelect(indices + range.toList())
}
| 91 | + |
/**
 * Starts building a positive column selection from a single column [index].
 * Further columns can be chained on with `and`.
 */
fun with(index: Int): PosSelect {
    val first: Array<Int> = arrayOf(index)
    return PosSelect(first)
}
| 94 | + |
/** Starts building a positive column selection containing every index in [range]. */
fun with(range: IntRange): PosSelect {
    val columns: Array<Int> = range.toList().toTypedArray()
    return PosSelect(columns)
}
/** Starts building a negative column selection from the single column [index] to leave out. */
fun without(index: Int): NegSelect {
    val first: Array<Int> = arrayOf(index)
    return NegSelect(first)
}
/** Starts building a negative column selection that leaves out every index in [range]. */
fun without(range: IntRange): NegSelect {
    val columns: Array<Int> = range.toList().toTypedArray()
    return NegSelect(columns)
}
| 98 | + |
| 99 | + |
/**
 * Decides whether the column at [colIndex] survives the given selection.
 *
 * A [PosSelect] keeps exactly the listed indices; any other selector keeps exactly the
 * indices that are not listed.
 */
private fun retainColumn(selectIndex: ColSelect, colIndex: Int): Boolean {
    val listed = colIndex in selectIndex.indices
    val keepsListed = selectIndex is PosSelect

    // Keep when "listed" and "selector keeps listed columns" agree.
    return keepsListed == listed
}
| 105 | + |
/**
 * Select or remove columns by providing a plain index vector.
 *
 * If every index is greater than zero the indices are wrapped in a [PosSelect]; otherwise they
 * are wrapped, unchanged, in a [NegSelect]. The result is delegated to the [ColSelect]-based
 * `select` overload. Index vectors that are neither all positive nor all negative are rejected
 * via `stopIfNot`.
 *
 * More complex selectors can be assembled with the [with] / [without]
 * [builders](https://en.wikipedia.org/wiki/Builder_pattern) and passed to the other overload.
 */
fun Sequence<Row>.select(vararg colIndices: Int): Sequence<Row> {
    val allPositive = colIndices.all { it > 0 }
    val allNegative = colIndices.all { it < 0 }

    stopIfNot(allPositive || allNegative) { " Can not mix positive and negative selections" }

    val boxed: Array<Int> = colIndices.toTypedArray()
    val selector: ColSelect = when {
        allPositive -> PosSelect(boxed)
        else -> NegSelect(boxed)
    }

    return select(selector)
}
| 119 | + |
/**
 * Applies a prepared column selection to every row of this sequence.
 *
 * Column indices are 1-based.
 *  - [PosSelect]: each output row contains the selected columns in the order the indices were
 *    given, so columns can be reordered or repeated. An index outside the row makes the
 *    underlying list access throw.
 *  - any other selector (i.e. [NegSelect]): each output row keeps its original column order and
 *    omits the listed columns. The sign of an index is ignored here, so a selector built from
 *    `-2` and one built with `without(2)` both drop the second column.
 *
 * @param columns the selection to apply, typically built with [with] or [without].
 * @return a lazily evaluated sequence of the reduced rows.
 */
fun Sequence<Row>.select(columns: ColSelect): Sequence<Row> {
    // A filter-based positive selection would be cheaper but cannot change the column order:
    // return map { it.filterIndexed { index, _ -> retainColumn(columns, index + 1) } }

    return if (columns is PosSelect) {
        // positive selection: translate each 1-based index to a 0-based list position
        map { row -> columns.indices.map { row[it - 1] } }
    } else {
        // negative selection: filterIndexed counts from 0 while selector indices count from 1,
        // hence the comparison against index + 1 (the previous index - 1 dropped the wrong column).
        val dropped = columns.indices.map { if (it < 0) -it else it }.toSet()
        map { row -> row.filterIndexed { index, _ -> (index + 1) !in dropped } }
    }
}
| 132 | + |
| 133 | + |
68 | 134 | // todo add krangl ColNames interface here |
69 | 135 |
|
0 commit comments