// Command fix-columns tidies a word-list CSV file.
//
// It reads 1.csv from the current directory, trims Unicode whitespace around
// every field and, when the third column of a row is blank, moves the leading
// hiragana words of the fourth column into it. The result is written to 2.csv.
package main

import (
	"encoding/csv"
	"fmt"
	"io"
	"os"
	"unicode"
	"unicode/utf8"
)

// many returns the byte length of the longest prefix of str whose runes all
// satisfy p. If the entire string satisfies p the result equals len(str),
// i.e. it is one past the last valid index.
func many(str string, p func(rune) bool) int {
	i := 0
	for i < len(str) {
		c, w := utf8.DecodeRuneInString(str[i:])
		if !p(c) {
			break
		}
		i += w
	}
	return i
}

// selectSubstring returns byte offsets start and end such that str[start:end]
// begins right after the leading Unicode whitespace and covers one or more
// runs of runes satisfying p, separated from each other by whitespace.
// Whitespace that follows the last run is left out. If no rune satisfying p
// follows the leading whitespace, start == end.
//
// When p itself accepts whitespace (for example isAny), whitespace is consumed
// as part of a run, so trailing whitespace is then included.
func selectSubstring(str string, p func(rune) bool) (int, int) {
	start := many(str, unicode.IsSpace)

	end, gap := start, 0
	run := many(str[start:], p)

	// Advance end only when another run follows the whitespace gap, so the
	// whitespace after the final run is never included.
	for run != 0 {
		end += gap + run
		gap = many(str[end:], unicode.IsSpace)
		run = many(str[end+gap:], p)
	}

	return start, end
}

// isHiragana reports whether c is in the Unicode Hiragana script.
func isHiragana(c rune) bool {
	return unicode.Is(unicode.Hiragana, c)
}

// isNotSpace reports whether c is not Unicode whitespace.
func isNotSpace(c rune) bool {
	return !unicode.IsSpace(c)
}

// isAny accepts every rune.
func isAny(c rune) bool {
	return true
}

// isEmptyField reports whether field is empty or consists only of Unicode
// whitespace.
func isEmptyField(field string) bool {
	for _, c := range field {
		if !unicode.IsSpace(c) {
			return false
		}
	}
	return true
}

// splitReading normalizes one CSV record in place and returns it.
//
// Every field is trimmed of leading and trailing Unicode whitespace. In
// addition, if the third field (index 2) is blank and a fourth field exists,
// the leading hiragana words of the fourth field are moved into the third
// one, and the fourth field keeps the remainder.
//
// Records with fewer than four fields are only trimmed.
func splitReading(record []string) []string {
	for i := range record {
		// The len check guards record[3]: without it a short record with a
		// blank third field would panic with an index out of range.
		if i == 2 && len(record) > 3 && isEmptyField(record[2]) {
			field := record[3]
			start, end := selectSubstring(field, isHiragana)
			record[2] = field[start:end]
			// The remainder is trimmed on the next iteration (i == 3).
			record[3] = field[end:]
			continue
		}
		field := record[i]
		start, end := selectSubstring(field, isNotSpace)
		record[i] = field[start:end]
	}

	return record
}

// fixColumns reads CSV records from r, passes each one through splitReading
// and writes the result as CSV to w. Every input row is also echoed to
// standard output. It returns the first read or write error encountered.
func fixColumns(r io.Reader, w io.Writer) error {
	csvIn := csv.NewReader(r)
	csvOut := csv.NewWriter(w)
	// Flush even on an early error return, so that rows converted before the
	// failure still reach w.
	defer csvOut.Flush()

	for {
		rec, err := csvIn.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		fmt.Printf("row: %v\n", rec)
		if err := csvOut.Write(splitReading(rec)); err != nil {
			return err
		}
	}

	// csv.Writer buffers its output; write failures only become visible
	// through Error after an explicit Flush.
	csvOut.Flush()
	return csvOut.Error()
}

// run converts the CSV file at inPath into outPath, creating or truncating
// the output file. Both files are closed before run returns; a failure to
// close the output file is reported if nothing else went wrong.
func run(inPath, outPath string) (err error) {
	in, err := os.Open(inPath)
	if err != nil {
		return err
	}
	defer in.Close()

	// The output is a plain data file, so it is not created executable.
	out, err := os.OpenFile(outPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	return fixColumns(in, out)
}

func main() {
	if err := run("1.csv", "2.csv"); err != nil {
		fmt.Fprintf(os.Stderr, "Error: %v\n", err)
		os.Exit(1)
	}
}