Spaces:
Build error
Build error
| import pandas as pd | |
| from sudachipy import tokenizer, dictionary | |
| import neologdn | |
| import os # osモジュールをインポート | |
# --- Resolve paths relative to this script's directory ---
# Absolute path of this script file, then the directory that contains it.
_script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(_script_path)
# Make that directory the working directory so relative paths such as
# "DataSet.xlsx" resolve next to the script regardless of where it is launched.
os.chdir(script_dir)
def load_preprocessed_data():
    """Load ``DataSet.xlsx`` and build the labelled, tokenised comment table.

    The workbook is opened by its relative name, so it is resolved against
    the current working directory. Rows lacking a comment, gender, or age
    group are discarded, comments are normalised with ``neologdn``, label
    columns are derived, and every comment is tokenised with SudachiPy.

    Columns added to the returned frame, in this order:
        * ``年代性別`` - age group and gender joined by a single space.
        * ``<age>_label`` - one 0/1 column per age group (10代 .. 60代).
        * ``性別_label`` - position of the gender in ``["male", "female"]``.
        * ``年代性別_label`` - position of the combined category in the
          twelve-entry list below (kept for backward compatibility).
        * ``tokens`` - list of ``surface/part-of-speech`` strings.
        * ``text`` - the tokens joined by single spaces.

    Values that are not present in a category list are mapped to NaN by
    ``Series.map``; they are not dropped here.

    Returns:
        tuple: ``(df, initial_count)`` where ``df`` is the preprocessed
        DataFrame and ``initial_count`` is the number of rows read from the
        workbook before any were removed.
    """

    def index_by_position(categories):
        """Map each category name to its position in *categories*."""
        return {name: position for position, name in enumerate(categories)}

    # --- Step 1: load the workbook ---
    raw_df = pd.read_excel("DataSet.xlsx")
    initial_count = len(raw_df)

    # --- Step 2: drop rows missing any required field ---
    required_columns = ["コメント", "性別", "年代"]
    df = raw_df.dropna(subset=required_columns)
    df = df.reset_index(drop=True)

    # --- Step 2.5: normalise orthographic variants in the comments ---
    comment_text = df["コメント"].astype(str)
    df["コメント"] = comment_text.apply(neologdn.normalize)

    # --- Step 3: build separate age-group and gender labels ---
    df["年代性別"] = df["年代"] + " " + df["性別"]

    # One binary column per age group: 1 when the row belongs to it, else 0.
    for age in ("10代", "20代", "30代", "40代", "50代", "60代"):
        df[f"{age}_label"] = df["年代"].eq(age).astype(int)

    # Gender label: index of the value within this list.
    gender_label_map = index_by_position(["male", "female"])
    df["性別_label"] = df["性別"].map(gender_label_map)

    # Combined age/gender label, retained for backward compatibility.
    combined_label_map = index_by_position([
        "10代 male", "10代 female",
        "20代 male", "20代 female",
        "30代 male", "30代 female",
        "40代 male", "40代 female",
        "50代 male", "50代 female",
        "60代 male", "60代 female",
    ])
    df["年代性別_label"] = df["年代性別"].map(combined_label_map)

    # --- Step 4: morphological analysis with SudachiPy (surface + POS) ---
    sudachi = dictionary.Dictionary().create()
    split_mode = tokenizer.Tokenizer.SplitMode.C

    def sudachi_tokenize_with_pos(text):
        """Return ``surface/POS`` strings for the non-blank morphemes of *text*."""
        morphemes = sudachi.tokenize(text, split_mode)
        # Whitespace-only surfaces are skipped; the POS tag is the first
        # element of the morpheme's part-of-speech tuple.
        return [
            f"{m.surface()}/{m.part_of_speech()[0]}"
            for m in morphemes
            if m.surface().strip()
        ]

    df["tokens"] = df["コメント"].apply(sudachi_tokenize_with_pos)
    df["text"] = df["tokens"].apply(" ".join)

    return df, initial_count
if __name__ == '__main__':
    # Run the full preprocessing pipeline when executed as a script.
    df, initial_count = load_preprocessed_data()

    # --- Summary ---
    print(f"✅ Excel内の全データ数: {initial_count} 件")
    print(f"\n✅ 前処理後のデータ数: {len(df)} 件")
    print("==== Sudachipyによる処理結果の一部 ====")

    # Preview at most the first ten rows. Positional lookups with .loc work
    # because load_preprocessed_data() resets the index after dropping rows.
    preview_count = min(10, len(df))
    for i in range(preview_count):
        print(f"\n【{i+1}件目】")
        print(f"[原文(正規化後)] {df.loc[i, 'コメント']}")
        print(f"[形態素+品詞] {df.loc[i, 'tokens']}")
        print(f"[テキスト形式] {df.loc[i, 'text']}")
        print(f"[年代性別] {df.loc[i, '年代性別']}")
        print(f"[年代] {df.loc[i, '年代']}")
        # Per-age-group binary labels, split over two lines of output.
        print(f" 10代_label: {df.loc[i, '10代_label']}, 20代_label: {df.loc[i, '20代_label']}, 30代_label: {df.loc[i, '30代_label']}")
        print(f" 40代_label: {df.loc[i, '40代_label']}, 50代_label: {df.loc[i, '50代_label']}, 60代_label: {df.loc[i, '60代_label']}")
        print(f"[性別] {df.loc[i, '性別']} -> [性別_label] {df.loc[i, '性別_label']}")