Hijiki-HF committed on
Commit
881644a
·
1 Parent(s): b728562

add: preprocess data str

Browse files
src/collect/collect_data.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def collect_data(
    src_path: str = "../data/one_book.txt",
    dst_path: str = "../data/articles.csv",
) -> pd.DataFrame:
    """Build a one-row article table from a plain-text book file and save it as CSV.

    The first line of the source file is taken as the book title. All
    remaining lines are concatenated without separators into the body, and a
    newline is inserted in front of every ``##`` marker.

    Args:
        src_path: Path of the text file to read. Relative paths are resolved
            against the current working directory.
        dst_path: Path of the CSV file to write (no index column).

    Returns:
        The single-row DataFrame with columns ``book_title`` and ``body``
        that was written to ``dst_path``.

    Raises:
        IndexError: If the source file contains no lines, so there is no
            title to read.
    """
    # For now the data is prepared by hand in a text file.
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(src_path, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    if not lines:
        raise IndexError(
            f"{src_path} is empty; expected the book title on the first line"
        )

    book_title, *body_lines = lines
    # splitlines() already strips line terminators, so only the "##"
    # markers need a newline re-inserted in front of them.
    body = "".join(body_lines).replace("##", "\n##")

    data_dict = {"book_title": book_title, "body": body}
    df = pd.DataFrame(data_dict, index=[0])
    df.to_csv(dst_path, index=False)
    return df
14
+
15
# Run the collection step only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    collect_data()
src/data_dummy/dummy_articles.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ book_title,body
2
+ 坊ちゃん,ここに感想記事の本文を貼り付ける