richardkovacs committed on
Commit
11d3b20
1 Parent(s): fca027b

feat: add dataset preparator script

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. dataset.py +20 -0
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  venv
2
  flagged
3
  .env
 
 
1
  venv
2
  flagged
3
  .env
4
+ *.csv
dataset.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import pandas as pd
3
+
# Sub-sampling stride: keep every `divider`-th row of each split (1 keeps all).
divider = 1
# Nominal row count of a sub-sampled split.
# NOTE(review): assumes each IMDB split holds 25,000 rows -- confirm.
data_size = 25000 // divider
# Nominal row count per label; only used to build the output file names.
# NOTE: this is a floored value, while `iloc[::divider]` keeps
# ceil(rows / divider) rows, so for strides that do not divide the split
# evenly the number in the file name can be slightly lower than the real count.
case_size = data_size // 2


def prepare_split(frame: pd.DataFrame, step: int = 1) -> pd.DataFrame:
    """Return a sub-sampled copy of *frame* with human-readable labels.

    Every ``step``-th row is kept (starting with the first one) and the
    ``label`` column is rewritten so that ``0`` becomes ``'NEGATIVE'`` and
    every other value becomes ``'POSITIVE'``.

    Args:
        frame: Data frame containing at least a ``label`` column.
        step: Positive stride used to pick rows; ``1`` keeps all rows.

    Returns:
        A new data frame; *frame* itself is left unmodified.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError(f'step must be a positive integer, got {step!r}')

    # Copy explicitly: assigning a column on the bare `iloc` slice is a
    # chained assignment on a view of `frame`, which triggers pandas'
    # SettingWithCopyWarning.
    subset = frame.iloc[::step, :].copy()
    # Vectorised form of "NEGATIVE if label == 0 else POSITIVE".
    subset['label'] = subset['label'].eq(0).map(
        {True: 'NEGATIVE', False: 'POSITIVE'}
    )
    return subset


def csv_name(split: str, per_label: int) -> str:
    """Return the CSV file name for *split* tagged with *per_label* twice."""
    return f'imdb_{split}_{per_label}_{per_label}.csv'


def main() -> None:
    """Load the "imdb" dataset and export its train and test splits as CSV.

    The files are written to the current working directory without the
    data frame index.
    """
    dataset = load_dataset("imdb")

    # Handle one split at a time so both full frames are never alive together.
    for split in ('train', 'test'):
        frame = prepare_split(pd.DataFrame(dataset[split]), divider)
        frame.to_csv(csv_name(split, case_size), index=False)


if __name__ == "__main__":
    main()