AI SearchでSharepoint上のデータをベクトル化する - CCCMKホールディングス TECH Labの Tech Blog

こんにちは、CCCMKホールディングスTECH LABの井上です。

前回の記事の続きになります。

AI SearchのSharePointからのインデックス作成をやってみました - CCCMKホールディングス TECH Labの Tech Blog

前回は、Sharepointをデータソースにしてベクトル化した際にメタデータを取得できないところで終わっていましたが、今回はメタデータも取得できるようになりました。
使ったAPIのリクエスト内容は以下のものになります。

1.データソース作成

POST https://{{search_service}}.search.windows.net/datasources?api-version={{api-version}}
Content-Type: application/json
api-key: [admin key]
{
    "name": "{{index_name}}-datasource",
    "type": "sharepoint",
    "credentials": {
        "connectionString": "{{env_sharepoint_connection_string}}"
    },
    "container": {
        "name": "defaultSiteLibrary",
        "query": null
    }
}

2.インデックス作成

POST https://{{search_service}}.search.windows.net/indexes?api-version={{api-version}}
Content-Type: application/json
api-key: [admin key]
{
    "name": "{{index_name}}",
    "defaultScoringProfile": null,
    "fields": [
        {
            "name": "chunk_id",
            "type": "Edm.String",
            "searchable": true,
            "filterable": true,
            "retrievable": true,
            "sortable": true,
            "facetable": true,
            "key": true,
            "indexAnalyzer": null,
            "searchAnalyzer": null,
            "analyzer": "keyword",
            "normalizer": null,
            "dimensions": null,
            "vectorSearchProfile": null,
            "synonymMaps": []
        },
        {
            "name": "parent_id",
            "type": "Edm.String",
            "searchable": true,
            "filterable": true,
            "retrievable": true,
            "sortable": true,
            "facetable": true,
            "key": false,
            "indexAnalyzer": null,
            "searchAnalyzer": null,
            "analyzer": null,
            "normalizer": null,
            "dimensions": null,
            "vectorSearchProfile": null,
            "synonymMaps": []
        },
        {
            "name": "chunk",
            "type": "Edm.String",
            "searchable": true,
            "filterable": false,
            "retrievable": true,
            "sortable": false,
            "facetable": false,
            "key": false,
            "indexAnalyzer": null,
            "searchAnalyzer": null,
            "analyzer": "ja.lucene",
            "normalizer": null,
            "dimensions": null,
            "vectorSearchProfile": null,
            "synonymMaps": []
        },
        {
            "name": "title",
            "type": "Edm.String",
            "searchable": true,
            "filterable": true,
            "retrievable": true,
            "sortable": false,
            "facetable": false,
            "key": false,
            "indexAnalyzer": null,
            "searchAnalyzer": null,
            "analyzer": "ja.lucene",
            "normalizer": null,
            "dimensions": null,
            "vectorSearchProfile": null,
            "synonymMaps": []
        },
        {
            "name": "vector",
            "type": "Collection(Edm.Single)",
            "searchable": true,
            "filterable": false,
            "retrievable": true,
            "sortable": false,
            "facetable": false,
            "key": false,
            "indexAnalyzer": null,
            "searchAnalyzer": null,
            "analyzer": null,
            "normalizer": null,
            "dimensions": 1536,
            "vectorSearchProfile": "{{index_name}}-profile",
            "synonymMaps": []
        },
        {
            "name": "url_path",
            "type": "Edm.String",
            "searchable": true,
            "filterable": true,
            "retrievable": true,
            "sortable": false,
            "facetable": false,
            "key": false,
            "indexAnalyzer": null,
            "searchAnalyzer": null,
            "analyzer": "ja.lucene",
            "normalizer": null,
            "dimensions": null,
            "vectorSearchProfile": null,
            "synonymMaps": []
        }
    ],
    "scoringProfiles": [],
    "corsOptions": {
        "allowedOrigins": [
            "*"
        ],
        "maxAgeInSeconds": 300
    },
    "suggesters": [],
    "analyzers": [],
    "normalizers": [],
    "tokenizers": [],
    "tokenFilters": [],
    "charFilters": [],
    "encryptionKey": null,
    "similarity": {
        "@odata.type": "#Microsoft.Azure.Search.BM25Similarity",
        "k1": null,
        "b": null
    },
    "semantic": {
        "defaultConfiguration": "{{index_name}}-semantic-configuration",
        "configurations": [
            {
                "name": "{{index_name}}-semantic-configuration",
                "prioritizedFields": {
                    "titleField": {
                        "fieldName": "title"
                    },
                    "prioritizedContentFields": [
                        {
                            "fieldName": "chunk"
                        }
                    ],
                    "prioritizedKeywordsFields": []
                }
            }
        ]
    },
    "vectorSearch": {
        "algorithms": [
            {
                "name": "{{index_name}}-algorithm",
                "kind": "hnsw",
                "hnswParameters": {
                    "metric": "cosine",
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 500
                },
                "exhaustiveKnnParameters": null
            }
        ],
        "profiles": [
            {
                "name": "{{index_name}}-profile",
                "algorithm": "{{index_name}}-algorithm",
                "vectorizer": "{{index_name}}-vectorizer"
            }
        ],
        "vectorizers": [
            {
                "name": "{{index_name}}-vectorizer",
                "kind": "azureOpenAI",
                "azureOpenAIParameters": {
                    "resourceUri": "{{resourceUri}}",
                    "deploymentId": "{{deploymentId}}",
                    "apiKey": "{{openAIapiKey}}",
                    "authIdentity": null
                },
                "customWebApiParameters": null
            }
        ]
    }
}

3.スキルセット作成

PUT https://{{search_service}}.search.windows.net/skillsets/{{index_name}}-skillset?api-version={{api-version}}
Content-Type: application/json
api-key: [admin key]
{
  "name": "{{index_name}}-skillset",
  "description": "Skillset to chunk documents and generate embeddings",
  "skills": [
    {
      "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
      "name": "#1",
      "description": null,
      "context": "/document/pages/*",
      "resourceUri": "{{resourceUri}}",
      "apiKey": "{{openAIapiKey}}",
      "deploymentId": "{{deploymentId}}",
      "inputs": [
        {
          "name": "text",
          "source": "/document/pages/*"
        }
      ],
      "outputs": [
        {
          "name": "embedding",
          "targetName": "vector"
        }
      ],
      "authIdentity": null
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
      "name": "#2",
      "description": "Split skill to chunk documents",
      "context": "/document",
      "defaultLanguageCode": "ja",
      "textSplitMode": "pages",
      "maximumPageLength": 2000,
      "pageOverlapLength": 500,
      "maximumPagesToTake": 0,
      "inputs": [
        {
          "name": "text",
          "source": "/document/content"
        }
      ],
      "outputs": [
        {
          "name": "textItems",
          "targetName": "pages"
        }
      ]
    }
  ],
  "cognitiveServices": null,
  "knowledgeStore": null,
  "indexProjections": {
    "selectors": [
      {
        "targetIndexName": "{{index_name}}",
        "parentKeyFieldName": "parent_id",
        "sourceContext": "/document/pages/*",
        "mappings": [
          {
            "name": "chunk",
            "source": "/document/pages/*",
            "sourceContext": null,
            "inputs": []
          },
          {
            "name": "vector",
            "source": "/document/pages/*/vector",
            "sourceContext": null,
            "inputs": []
          },
          {
            "name": "title",
            "source": "/document/metadata_spo_item_name",
            "sourceContext": null,
            "inputs": []
          },
          {
            "name": "url_path",
            "source": "/document/metadata_spo_item_weburi",
            "sourceContext": null,
            "inputs": []
          }
        ]
      }
    ],
    "parameters": {
      "projectionMode": "skipIndexingParentDocuments"
    }
  },
  "encryptionKey": null
}

4.インデクサー作成

POST https://{{search_service}}.search.windows.net/indexers?api-version={{api-version}}
Content-Type: application/json
api-key: [admin key]
{
    "name": "{{index_name}}-indexer",
    "description": null,
    "dataSourceName": "{{index_name}}-datasource",
    "skillsetName": "{{index_name}}-skillset",
    "targetIndexName": "{{index_name}}",
    "disabled": null,
    "schedule": null,
    "parameters": {
        "batchSize": null,
        "maxFailedItems": null,
        "maxFailedItemsPerBatch": null,
        "base64EncodeKeys": null,
        "configuration": {
            "dataToExtract": "contentAndMetadata",
            "parsingMode": "default"
        }
    },
    "fieldMappings": [],
    "outputFieldMappings": [],
    "cache": null,
    "encryptionKey": null
}

※ {{}}内の変数を適宜環境に合わせて設定してください。

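ちなみに、このボディ部はAzure Portalの「データのインポートとベクター化」からインデックスを作成したときに生成されるJSONがベースになっています。スキルセットのindexProjectionsのmappingsで、上記のようにtitleのsourceに/document/metadata_spo_item_name、url_pathのsourceに/document/metadata_spo_item_weburiを指定すれば、Sharepointのメタデータが反映されます。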

また、一つのインデックスに対してSharepointとBlobそれぞれをデータソースとしてデータを取り込むことも試しました。こちらは詳細は省きますが、SharepointとBlob用にそれぞれデータソース、スキルセット、インデクサーを作成します。インデックスはどちらか一方のみ作成します。それぞれのインデクサーの項目targetIndexNameに同一のインデックス名を指定することで、一つのインデックスにデータが格納されます。
検索した結果は以下のようになっており、SharepointとBlobの両方のデータが格納されていることが確認できます。

このリンクをクリックするとSharepoint上のデータはブラウザで表示されますが、Blobのデータは表示されません。Blobのデータを表示（取得）するには、URLの後ろにパラメータとしてSASトークンを付与します（Edgeでは表示できますが、Chromeではダウンロードになります）。SASトークンの発行方法の詳細は省きますが、Azure Portal上でストレージアカウント画面にあるShared Access Signatureで行います。
SASトークンをURLに表示したくないのでPOSTでページを開こうとも試みましたが、BlobがPOSTを許可していないようで開けませんでした。

前回と今回で、以下のことが確認できました。
・Sharepoint上のデータのベクトル化
・Sharepoint上のメタデータの取得
・異なるデータソースから1つのインデックスの作成