Skip to content

Commit c993913

Browse files
committed
added awk-like line processing api
1 parent f756455 commit c993913

10 files changed

Lines changed: 251 additions & 63 deletions

File tree

src/main/kotlin/kscript/AwkLike.kt

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
package kscript
2+
3+
/**
4+
* Utility methods to allow for awk-like data processing using kscript.
5+
*
6+
* The general usage pattern is to start with a `Sequence<String>` that is typically provided as `lines` by [kscript.resolveArgFile], and to end with one of the `print` extension methods provided below.
7+
*
8+
* In one-liners `lines` is implicitly added by `kscript` and all elements from `kscript.*` are imported. This allows for constructs such as
9+
* ```
10+
* kscript 'lines.filter{ it.contains("foo") }.print()' some_file.txt
11+
* kscript 'lines.filter{ it.contains("foo") }.print()'
12+
* cat some_file.txt | kscript 'lines.filter{ it.contains("foo") }.print()'
13+
*
14+
* ```
15+
*
16+
* @author Holger Brandl
17+
*/
18+
19+
20+
// for top-level vs member extensions see https://kotlinlang.org/docs/reference/extensions.html#scope-of-extensions
21+
22+
23+
/** For sake of readability we refer to a list of strings as a row here. */
24+
typealias Row = List<String>
25+
26+
//typealias RowSequence = Sequence<Row>
27+
//class RowSequence(input: Sequence<String>){
28+
//}
29+
30+
31+
/**
 * Splits every line of the receiver into a [Row] of column values.
 *
 * @param separator The column delimiter string; defaults to a tab.
 * @return A sequence with one [Row] per input line.
 */
fun Sequence<String>.split(separator: String = "\t"): Sequence<Row> =
    map { line -> line.split(separator) }
38+
39+
/**
 * awk-like convenience wrapper: splits each line into a [Row], applies [rule] to it and prints the
 * resulting string.
 *
 * @param separator The column delimiter used for splitting; defaults to a tab.
 * @param rule Computes the output line for a row.
 */
fun Sequence<String>.awk(separator: String = "\t", rule: (Row) -> String) {
    val rendered = split(separator).map { row -> rule(row) }
    rendered.print()
}
41+
42+
43+
/**
 * Builds a new row from every input row by applying each of the [rules] to it; the i-th column of
 * the result is the value returned by the i-th rule.
 */
fun Sequence<Row>.map(vararg rules: (Row) -> String): Sequence<Row> =
    map { row -> rules.map { rule -> rule(row) } }
46+
47+
/**
 * Appends a new column to every row.
 *
 * @param rule Computes the value of the appended column from the existing row.
 */
fun Sequence<Row>.add(rule: (Row) -> String): Sequence<Row> =
    map { row -> row + rule(row) }
51+
52+
53+
//@Deprecated("use kscript.awk() instead")
54+
//fun Sequence<String>.splitMap(vararg rules: (Row) -> String, separator: String = "\t", joinWith: String = separator) {
55+
// map { it.split(separator).let { splitLine -> rules.map { it(splitLine) } } }.print()
56+
//}
57+
58+
59+
/** Turns every row back into a single line by joining its columns with [separator] (tab by default). */
fun Sequence<Row>.join(separator: String = "\t"): Sequence<String> =
    map { columns -> columns.joinToString(separator) }
60+
61+
/** Joins the columns of each row with the provided `separator` and prints the resulting lines to `stdout`. */
fun Sequence<Row>.print(separator: String = "\t") {
    val renderedLines = join(separator)
    renderedLines.print()
}
63+
64+
65+
/**
 * Joins the columns of each row with the provided `separator` and prints the resulting lines to `stdout`.
 *
 * Rows are rendered as separator-joined lines rather than via `List.toString()` (which would give
 * `[a, b, c]`), so the output has the same shape as the one of the `Sequence` based `print`.
 *
 * @param separator The column delimiter used in the output; defaults to a tab.
 */
fun List<Row>.print(separator: String = "\t") = forEach { println(it.joinToString(separator)) }
66+
67+
68+
// todo add krangl ColNames interface here
69+

src/main/kotlin/kscript/FileUtil.kt

Lines changed: 0 additions & 36 deletions
This file was deleted.

src/main/kotlin/kscript/StreamUtil.kt

Lines changed: 49 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,67 @@
11
package kscript
22

3-
import java.io.BufferedReader
4-
import java.io.File
5-
import java.io.FileReader
3+
import java.io.*
4+
import java.util.zip.GZIPOutputStream
65

76

8-
// for top-level vs member extensions see https://kotlinlang.org/docs/reference/extensions.html#scope-of-extensions
9-
//object KscriptHelpers {}
7+
/**
 * Lazily created `Sequence<String>` over the lines of standard input.
 *
 * Elements are produced by `readLine()` and the sequence ends once it returns `null`. Sequences
 * created with `generateSequence { ... }` can be iterated only once.
 */
val stdin: Sequence<String> by lazy { generateSequence { readLine() } }
109

11-
val stdin = generateSequence() { readLine() }
10+
/**
 * Provides the lines of [file] as a sequence.
 *
 * The underlying reader is not closed by this function.
 */
fun linesFrom(file: File): Sequence<String> {
    val reader = BufferedReader(FileReader(file))
    return reader.lineSequence()
}
1211

12+
/** Wraps the given strings into a line sequence; just used for testing and development. */
fun linesFrom(vararg lines: String): Sequence<String> = sequenceOf(*lines)
1314

14-
/** Read lines stdin or a file argument. Example 'argLines(1).map { it+"foo"}'. could be used either with "-" or file argument. */
15-
fun argLines(arg: String, stdinNames: List<String> = listOf("-", "stdin")): Sequence<String> {
16-
if (stdinNames.contains(arg)) return stdin
1715

18-
val inputFile = File(arg)
16+
/**
 * File argument processor that works similar to awk: if data is available on stdin it is used,
 * otherwise the argument at [position] is expected to be a readable file and its lines are read instead.
 *
 * @param args The command line arguments of the script.
 * @param position Index within [args] of the input file argument.
 * @return The lines of stdin or of the resolved input file.
 */
fun resolveArgFile(args: Array<String>, position: Int = 0): Sequence<String> {
    // Probe the underlying stream instead of calling `stdin.iterator().hasNext()`: `stdin` is created
    // with `generateSequence { ... }` and can be iterated only once, so peeking through an iterator
    // would make the sequence returned to the caller fail on its first use.
    // NOTE(review): `available()` only reports bytes that are readable without blocking, so a slow
    // upstream producer may not be detected — confirm this is acceptable for piped input.
    if (System.`in`.available() > 0) return stdin

    stopIfNot(args.isNotEmpty()) { "Missing file or input stream" }
    // `position` has to be a valid index, i.e. strictly smaller than the number of arguments
    stopIfNot(position in args.indices) { "arg position ${position} exceeds number of arguments ${args.size} " }

    val fileArg = args[position]

    // stdinNames: List<String> = listOf("-", "stdin")
    // if (stdinNames.contains(fileArg)) return stdin

    val inputFile = File(fileArg)

    stopIfNot(inputFile.canRead()) { "Can not read from '${fileArg}'" }

    // todo we don't close the buffer with this approach
    // BufferedReader(FileReader(inputFile )).use { return it }
    return BufferedReader(FileReader(inputFile)).lineSequence()
}
2639

27-
fun argMap(arg: String, stdinNames: List<String> = listOf("-", "stdin"), trafo: (String) -> String) =
28-
argLines(arg, stdinNames).map { trafo(it) }.print()
2940

30-
fun argFilter(arg: String, stdinNames: List<String> = listOf("-", "stdin"), trafo: (String) -> Boolean) =
31-
argLines(arg, stdinNames).filter { trafo(it) }.print()
41+
/** Endpoint for a kscript pipe: writes every element of the sequence as one line to `stdout`. */
fun Sequence<String>.print() {
    for (line in this) println(line)
}
43+
44+
/** Endpoint for a kscript pipe: writes every element of the collection as one line to `stdout`. */
fun Iterable<String>.print() {
    for (line in this) println(line)
}
46+
47+
48+
49+
//https://dzone.com/articles/readingwriting-compressed-and
50+
/**
 * Saves a list of items into a file. The output can optionally be compressed, and the stringifying
 * operation can be changed from `toString` to a custom operation if needed.
 *
 * NOTE(review): compressed output is always written with GZIP, also for file names ending in `.zip`,
 * so such files are not ZIP archives — confirm this is intended.
 *
 * @param f The target file.
 * @param transform Converts an item into its textual representation.
 * @param separator Written after every item (including the last one).
 * @param overwrite Whether an already existing file may be replaced.
 * @param compress Whether to gzip the output; defaults to `true` for `.zip` and `.gz` file names.
 * @throws IllegalArgumentException if [f] already exists as a file and [overwrite] is `false`.
 */
fun <T> Iterable<T>.saveAs(
    f: File,
    transform: (T) -> String = { it.toString() },
    separator: Char = '\n',
    overwrite: Boolean = true,
    compress: Boolean = f.name.let { it.endsWith(".zip") || it.endsWith(".gz") }
) {
    // ensure that file is not yet there or overwrite flag is set
    require(!f.isFile || overwrite) { "$f is present already. Use overwrite=true to enforce file replacement." }

    val writer: Writer = if (compress) {
        BufferedWriter(OutputStreamWriter(GZIPOutputStream(FileOutputStream(f))))
    } else {
        PrintWriter(f)
    }

    // `use` closes the writer even if `transform` or a write fails half-way through
    writer.use { out ->
        forEach { item -> out.write(transform(item) + separator) }
    }
}
3266

3367

34-
fun Sequence<String>.print() = forEach { println(it) }

src/main/kotlin/kscript/experimental/Incubator.kt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,8 @@ fun processStdin(trafo: (String) -> String) {
2525
println(trafo(it))
2626
}
2727
}
28+
29+
30+
/** Path-building shorthand: `dir / "child"` is the same as `dir.resolve("child")`. */
operator fun File.div(childName: String): File = resolve(childName)
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package kscript.experimental
2+
3+
import kscript.resolveArgFile
4+
5+
/**
 * Base class for developing one-liners: the input is resolved from the constructor arguments via
 * [resolveArgFile] and passed to [apply].
 *
 * Note that [apply] is invoked from the `init` block, i.e. while the instance is still being constructed.
 *
 * @author Holger Brandl
 */
abstract class OneLinerContext(args: Array<String>) {

    /** The input lines; resolved on first access. */
    val arg by lazy { resolveArgFile(args) }
    // val stdin by lazy { kscript.stdin }

    /** Holds the body of the one-liner; receives the resolved input lines. */
    abstract fun apply(lines: Sequence<String>)

    init {
        // run the one-liner as part of the construction of the context
        this.apply(arg)
    }
}
20+
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package kscript.examples
2+
3+
import kscript.*
4+
import kscript.experimental.OneLinerContext
5+
6+
/**
7+
* @author Holger Brandl
8+
*/
9+
10+
val args = arrayOf("flights.txt")
11+
12+
/** Collection of awk one-liners re-expressed with the kscript line-processing API. */
object AwkExample : OneLinerContext(args) {

    override fun apply(lines: Sequence<String>) {
        // The input is typically backed by a file reader or stdin and can then be consumed only once.
        // Buffer it so that each of the independent examples below can iterate over the same data.
        runExamples(lines.toList().asSequence())
    }

    /** Runs all examples against a re-iterable sequence of [lines]. */
    private fun runExamples(lines: Sequence<String>) {
        lines.split().map({ it[1] }, { it[2] }).print()


        lines.split().filter { it[3].matches("UA".toRegex()) }.print()

        // remove header
        lines.drop(1).split().filter { it[3].matches("UA".toRegex()) }.print()


        lines.awk { it[1] + it[2] }

        lines.awk { it[1] + it[2] }
        lines.awk { it[3] }

        // http://stackoverflow.com/questions/15361632/delete-a-column-with-awk-or-sed
        lines.split().map { it.toMutableList().apply { removeAt(3) } }.print()


        // http://tuxgraphics.org/~guido/scripts/awk-one-liner.html
        // Print the next two (i=2) lines after the line matching regexp:
        // awk '/regexp/{i=2;next;}{if(i){i--; print;}}' file.txt
        // NOTE(review): the window based variant below prints the matching line plus the three lines
        // following it, which is not exactly what the awk command does — confirm the intent.

        fun <T> List<T>.sliding(windowSize: Int): List<List<T>> {
            return this.dropLast(windowSize - 1).mapIndexed { i, _ -> this.subList(i, i + windowSize) }
        }

        val regex = "a[bc]+".toRegex()

        lines.toList()
            .sliding(4)
            .filter { it[0].matches(regex) }
            .flatten()
            .print()


        // number lines (from http://tuxgraphics.org/~guido/scripts/awk-one-liner.html)
        // awk '{print FNR "\t" $0}'
        // FNR starts at 1 and the awk command separates number and line with a tab
        lines.mapIndexed { num, line -> "${num + 1}\t$line" }.print()


        // Remove duplicate consecutive lines (uniq):
        // awk 'a !~ $0{print}; {a=$0}'


        // Delete trailing white space (spaces, tabs)
        // awk '{sub(/[ \t]*$/, "");print}' file.txt
        // only the end of the line is trimmed, leading whitespace is kept
        lines.map { it.trimEnd(' ', '\t') }.print()

        // Count lines (wc -l):
        // awk 'END{print NR}'
        println(lines.fold(0) { cur, _ -> cur + 1 })
        println(lines.count())
        // don't
        println(lines.toList().size)


        // Print the lines from a file starting at the line matching "start" until the line matching "stop":
        // awk '/start/,/stop/' file.txt
        // Skip everything before the start marker, then keep lines up to and including the stop marker.
        // NOTE(review): only the first start/stop range is printed, awk would print later ranges as well.
        var beforeStop = true
        lines.dropWhile { !it.startsWith("foo") }
            .takeWhile { line ->
                val keep = beforeStop
                if (line.startsWith("bar")) beforeStop = false
                keep
            }
            .print()

        val arg by lazy { resolveArgFile(args) }
        arg.filter { true }.print()
    }
}
81+
//file:///Users/brandl/Desktop/awk_cheatsheets.pdf
82+
83+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package kscript.examples
2+
3+
import kscript.add
4+
import kscript.split
5+
import kscript.experimental.OneLinerContext
6+
import kscript.print
7+
8+
/**
 * One-liner kscript example. To ease development simply extend [OneLinerContext] as shown, which will provide
 * the same context as `kscript` when running in single line mode.
 *
 * @author Holger Brandl
 */

fun main(args: Array<String>) {

    object : OneLinerContext(args) {

        override fun apply(lines: Sequence<String>) {
            // keep the rows whose fourth column is "UA" and append a column derived from that value
            val uaRows = lines.split().filter { row -> row[3] == "UA" }
            uaRows.add { row -> row[3] + ":" + row[3] }.print()
        }
    }
}

src/test/kotlin/kscript/test/FileApiTest.java

Lines changed: 0 additions & 8 deletions
This file was deleted.

src/test/kotlin/kscript/test/SupportApiTest.kt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package kscript.test
22

3-
import io.kotlintest.matchers.shouldBe
43
import io.kotlintest.specs.StringSpec
54
import kscript.*
65
import java.io.File
@@ -18,8 +17,8 @@ class SupportApiTest : StringSpec() { init {
1817
// "hello".length shouldBe 5
1918
// stopIfNot("FOO"=="BAR"){"condition not met"}
2019
println("current dir is " + File(".").absolutePath)
21-
argMap("src/test/resources/flights_head.txt") { it.split("\t")[7] }
22-
argFilter("src/test/resources/flights_head.txt") { it.split("\t")[7] == "UA" }
20+
linesFrom(File("src/test/resources/flights_head.txt")).map { it.split("\t")[7] }.print()
21+
linesFrom(File("src/test/resources/flights_head.txt")).filter { it.split("\t")[7] == "UA" }.print()
2322
}
2423
}
2524
}

0 commit comments

Comments
 (0)