Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: refactoring datasource and document model #49

Merged
merged 6 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added assets/connector/google_drive/document.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/connector/google_drive/form.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/connector/google_drive/icon.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/connector/google_drive/pdf.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/connector/google_drive/presentation.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/connector/google_drive/spreadsheet.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
File renamed without changes
File renamed without changes
17 changes: 13 additions & 4 deletions docs/content.en/docs/references/api/document.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Below is the field description for the document.

| **Field** | **Type** | **Description** |
|-------------------------|--------------------|-----------------------------------------------------------------------------------------------------|
| `source` | `string` | The source of the document, e.g., `google_drive`. |
| `source` | `object` | The source of the document. |
| `category` | `string` | Primary category of the document, e.g., `report`. |
| `categories` | `array[string]` | List of categories the document belongs to, e.g., `["business", "quarterly_reports"]`. |
| `cover` | `string` (URL) | URL to the cover image of the document. |
Expand All @@ -31,7 +31,8 @@ Below is the field description for the document.
| `owner.avatar` | `string` (URL) | URL to the owner's avatar image. |
| `owner.username` | `string` | Username of the owner, e.g., `jdoe`. |
| `owner.userid` | `string` | User ID of the owner, e.g., `user123`. |
| `metadata` | `object` | Additional metadata about the document. |
| `metadata` | `object` | Additional accessible metadata (e.g., file version, permissions). |
| `payload` | `object` | Additional store-only metadata (e.g., file binary data). |
| `last_updated_by` | `object` | Information about the last user who updated the document. |
| `last_updated_by.user` | `object` | Details about the user who last updated the document. |
| `last_updated_by.user.avatar` | `string` (URL) | URL to the avatar of the last editor. |
Expand All @@ -45,7 +46,11 @@ Below is the field description for the document.
```shell
//request
curl -H 'Content-Type: application/json' -XPOST http://localhost:2900/document/ -d '{
"source": "google_drive",
"source": {
"type":"connector",
"name":"google_drive",
"id":"e806831dacc3"
},
"category": "report",
"categories": ["business", "quarterly_reports"],
"cover": "https://example.com/images/report_cover.jpg",
Expand Down Expand Up @@ -111,7 +116,11 @@ curl -XGET http://localhost:2900/document/cso9vr3q50k38nobvmcg
"id": "cso9vr3q50k38nobvmcg",
"created": "2024-11-10T19:58:36.009086+08:00",
"updated": "2024-11-10T19:58:36.009092+08:00",
"source": "google_drive",
"source": {
"type":"connector",
"name":"google_drive",
"id":"e806831dacc3"
}
...OMITTED...
,
"found": true
Expand Down
6 changes: 5 additions & 1 deletion docs/content.en/docs/references/api/search.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ curl -XGET http://localhost:2900/query/_suggest\?query\=buss
{
"suggestion": "Q3 Business Report",
"score": 0.99,
"source": "google_drive"
"source": {
"type":"connector",
"name":"google_drive",
"id":"e806831dacc3"
}
}
]
}
Expand Down
2 changes: 1 addition & 1 deletion modules/assistant/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ func (h APIHandler) sendChatMessage(w http.ResponseWriter, req *http.Request, ps

webSocketID:=req.Header.Get("WEBSOCKET-SESSION-ID")

log.Info(req.Header)
log.Trace(req.Header)

sessionID := ps.MustGetParameter("session_id")
var request MessageRequest
Expand Down
19 changes: 19 additions & 0 deletions modules/common/connector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/* Copyright © INFINI LTD. All rights reserved.
* Web: https://infinilabs.com
* Email: hello#infini.ltd */

package common

// Connector describes a connector, i.e. an integration that documents can
// come from (e.g. "google_drive").
//
// It embeds CombinedFullText; the fields tagged with copy_to:combined_fulltext
// are declared to be copied into that combined full-text field by the mapping.
type Connector struct {
CombinedFullText
Name string `json:"name,omitempty" elastic_mapping:"name:{type:keyword,copy_to:combined_fulltext}"` // Name of the connector (e.g., "github", "google_drive", "dropbox")
Description string `json:"description,omitempty" elastic_mapping:"description:{type:keyword,copy_to:combined_fulltext}"` // Description of the connector
Category string `json:"category,omitempty" elastic_mapping:"category:{type:keyword,copy_to:combined_fulltext}"` // Category of the connector
Icon string `json:"icon,omitempty" elastic_mapping:"icon:{enabled:false}"` // Icon of the connector; the mapping is disabled (enabled:false)
Tags []string `json:"tags,omitempty" elastic_mapping:"tags:{type:keyword,copy_to:combined_fulltext}"` // Tags or keywords associated with the connector
URL string `json:"url,omitempty" elastic_mapping:"url:{enabled:false}"` // Link associated with the connector; the mapping is disabled (enabled:false). NOTE(review): confirm what this URL points to

// Assets groups static resources that belong to the connector.
// NOTE(review): neither Assets nor Icons carries a json or elastic_mapping tag,
// so encoding/json would emit them under their Go names ("Assets", "Icons") --
// confirm this is intended.
Assets struct {
Icons map[string]string // icon key -> icon URL
}
}
66 changes: 44 additions & 22 deletions modules/common/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,54 @@
package common

import (
"infini.sh/framework/core/orm"
"time"
)

// RichLabel is a text label paired with an icon key, so a label can be
// rendered together with an icon.
type RichLabel struct {
Label string `json:"label,omitempty" elastic_mapping:"label:{type:keyword,copy_to:combined_fulltext}"` // Label text; mapped as a keyword and copied into combined_fulltext
Icon string `json:"icon,omitempty" elastic_mapping:"icon:{enabled:false}"` // Icon key; must be resolved against the datasource's assets to obtain the icon URL
}

// DataSource identifies where a document came from: the kind of datasource,
// its name, and the ID of the concrete instance. All three fields are mapped
// as keywords and are omitted from JSON when empty.
type DataSource struct {
Type string `json:"type,omitempty" elastic_mapping:"type:{type:keyword}"` // Type of the datasource, e.g. "connector"
Name string `json:"name,omitempty" elastic_mapping:"name:{type:keyword}"` // Name of the source of the document (e.g., "github", "google_drive", "dropbox")
ID string `json:"id,omitempty" elastic_mapping:"id:{type:keyword}"` // ID of this connector instance
}

type Document struct {
orm.ORMObjectBase // Embedding ORM base for persistence-related fields

Source string `json:"source,omitempty" elastic_mapping:"source:{type:keyword,copy_to:combined_fulltext}"` // Source of the document (e.g., "github", "google_drive", "dropbox")
Category string `json:"category,omitempty" elastic_mapping:"category:{type:keyword,copy_to:combined_fulltext}"` // Primary category of the document (e.g., "report", "article")
Categories []string `json:"categories,omitempty" elastic_mapping:"categories:{type:keyword,copy_to:combined_fulltext}"` // Full hierarchy of categories, useful for detailed classification
Title string `json:"title,omitempty" elastic_mapping:"title:{type:text,copy_to:combined_fulltext,fields:{keyword: {type: keyword}, pinyin: {type: text, analyzer: pinyin_analyzer}}}"` // Document title
Summary string `json:"summary,omitempty" elastic_mapping:"summary:{type:text,copy_to:combined_fulltext}"` // Brief summary or description of the document
Lang string `json:"lang,omitempty" elastic_mapping:"lang:{type:keyword,copy_to:combined_fulltext}"` // Language code (e.g., "en", "fr")
Content string `json:"content,omitempty" elastic_mapping:"content:{type:text,copy_to:combined_fulltext}"` // Document content for full-text indexing
Icon string `json:"icon,omitempty" elastic_mapping:"icon:{enabled:false}"` // Thumbnail image URL, for preview purposes
Thumbnail string `json:"thumbnail,omitempty" elastic_mapping:"thumbnail:{enabled:false}"` // Thumbnail image URL, for preview purposes
Cover string `json:"cover,omitempty" elastic_mapping:"cover:{enabled:false}"` // Cover image URL, if applicable
Type string `json:"type,omitempty" elastic_mapping:"type:{type:keyword,copy_to:combined_fulltext}"` // Document type, such as PDF, Docx, etc.
Owner *UserInfo `json:"owner,omitempty" elastic_mapping:"owner:{type:object}"` // Document author or owner
Tags []string `json:"tags,omitempty" elastic_mapping:"tags:{type:keyword,copy_to:combined_fulltext}"` // Tags or keywords associated with the document, for easier retrieval
URL string `json:"url,omitempty" elastic_mapping:"url:{enabled:false}"` // Direct link to the document, if available
Size int `json:"size,omitempty" elastic_mapping:"size:{type:long}"` // File size in bytes, if applicable
Metadata map[string]interface{} `json:"metadata,omitempty" elastic_mapping:"metadata:{enabled:false}"` // Additional source-specific metadata (e.g., file version, permissions)
LastUpdatedBy *EditorInfo `json:"last_updated_by,omitempty" elastic_mapping:"last_updated_by:{type:object}"` // Struct containing last update information

CombinedFullText string `json:"-" elastic_mapping:"combined_fulltext:{type:text,index_prefixes:{},index_phrases:true, analyzer:combined_text_analyzer }"`
CombinedFullText

Source DataSource `json:"source,omitempty" elastic_mapping:"source:{type:object}"` // Source of the document

Type string `json:"type,omitempty" elastic_mapping:"type:{type:keyword,copy_to:combined_fulltext}"` // Document type, such as PDF, Docx, etc.

Category string `json:"category,omitempty" elastic_mapping:"category:{type:keyword,copy_to:combined_fulltext}"` // Primary category of the document (e.g., "report", "article")
Subcategory string `json:"subcategory,omitempty" elastic_mapping:"subcategory:{type:keyword,copy_to:combined_fulltext}"` // Secondary category of the document (e.g., "report", "article")

//use categories for very complex hierarchy categories
Categories []string `json:"categories,omitempty" elastic_mapping:"categories:{type:keyword,copy_to:combined_fulltext}"` // Full hierarchy of categories, useful for detailed classification

//use rich_categories for icon need to display for each category
RichCategories []RichLabel `json:"rich_categories,omitempty" elastic_mapping:"rich_categories:{type:object}"` // Full hierarchy of categories, useful for detailed classification, with icon decoration

Title string `json:"title,omitempty" elastic_mapping:"title:{type:text,copy_to:combined_fulltext,fields:{keyword: {type: keyword}, pinyin: {type: text, analyzer: pinyin_analyzer}}}"` // Document title
Summary string `json:"summary,omitempty" elastic_mapping:"summary:{type:text,copy_to:combined_fulltext}"` // Brief summary or description of the document

Lang string `json:"lang,omitempty" elastic_mapping:"lang:{type:keyword,copy_to:combined_fulltext}"` // Language code (e.g., "en", "fr")
Content string `json:"content,omitempty" elastic_mapping:"content:{type:text,copy_to:combined_fulltext}"` // Document content for full-text indexing

Icon string `json:"icon,omitempty" elastic_mapping:"icon:{enabled:false}"` // Icon Key, need work with datasource's assets to get the icon url, if it is a full url, then use it directly
Thumbnail string `json:"thumbnail,omitempty" elastic_mapping:"thumbnail:{enabled:false}"` // Thumbnail image URL, for preview purposes
Cover string `json:"cover,omitempty" elastic_mapping:"cover:{enabled:false}"` // Cover image URL, if applicable

Owner *UserInfo `json:"owner,omitempty" elastic_mapping:"owner:{type:object}"` // Document author or owner

Tags []string `json:"tags,omitempty" elastic_mapping:"tags:{type:keyword,copy_to:combined_fulltext}"` // Tags or keywords associated with the document, for easier retrieval
URL string `json:"url,omitempty" elastic_mapping:"url:{enabled:false}"` // Direct link to the document, if available
Size int `json:"size,omitempty" elastic_mapping:"size:{type:long}"` // File size in bytes, if applicable

LastUpdatedBy *EditorInfo `json:"last_updated_by,omitempty" elastic_mapping:"last_updated_by:{type:object}"` // Struct containing last update information

}

type EditorInfo struct {
Expand Down
16 changes: 16 additions & 0 deletions modules/common/fulltext.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/* Copyright © INFINI LTD. All rights reserved.
* Web: https://infinilabs.com
* Email: hello#infini.ltd */

package common

import "infini.sh/framework/core/orm"


// CombinedFullText is a shared base meant to be embedded by indexed types in
// this package. Despite its name it carries more than the full-text field:
// the ORM base object, the combined full-text target field, and two free-form
// maps (Metadata and Payload).
type CombinedFullText struct {
orm.ORMObjectBase // Embedding ORM base for persistence-related fields
// Target of the copy_to:combined_fulltext mappings declared on other fields.
// Tagged json:"-", so it is never part of the JSON representation.
CombinedFullText string `json:"-" elastic_mapping:"combined_fulltext:{type:text,index_prefixes:{},index_phrases:true, analyzer:combined_text_analyzer }"`

Metadata map[string]interface{} `json:"metadata,omitempty" elastic_mapping:"metadata:{type:object}"` // Additional accessible metadata (e.g., file version, permissions); mapped as an object
Payload map[string]interface{} `json:"payload,omitempty" elastic_mapping:"payload:{enabled:false}"` // Additional store-only metadata (e.g., file binary data); the mapping is disabled (enabled:false)
}
10 changes: 5 additions & 5 deletions modules/search/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,11 @@ func (h APIHandler) search(w http.ResponseWriter, req *http.Request, ps httprout
templatedQuery := orm.TemplatedQuery{}
templatedQuery.TemplateID = "coco-query-string"
templatedQuery.Parameters = util.MapStr{
"from": from,
"size": size,
"field": field,
"query": query,
"source": strings.Split(source, ","),
"from": from,
"size": size,
"field": field,
"query": query,
"source": strings.Split(source, ","),
}
q.TemplatedQuery = &templatedQuery
} else {
Expand Down
70 changes: 47 additions & 23 deletions plugins/connectors/google_drive/files.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,22 @@ import (
"time"
)

// getIcon translates a file's MIME type into the icon key used for that kind
// of file.
//
// The match is exact and case-sensitive. Any MIME type that is not listed
// below, including the empty string, yields the "default" key.
func getIcon(fileType string) string {
	// Start from the fallback and override it only for known MIME types.
	iconKey := "default"

	switch fileType {
	case "application/pdf":
		iconKey = "pdf"
	case "application/vnd.google-apps.document":
		iconKey = "document"
	case "application/vnd.google-apps.spreadsheet":
		iconKey = "spreadsheet"
	case "application/vnd.google-apps.presentation":
		iconKey = "presentation"
	case "application/vnd.google-apps.form":
		iconKey = "form"
	}

	return iconKey
}

func (this *Plugin) startIndexingFiles(tenantID,userID string,tok *oauth2.Token) {
var filesProcessed =0
Expand Down Expand Up @@ -56,7 +72,7 @@ func (this *Plugin) startIndexingFiles(tenantID,userID string,tok *oauth2.Token)

log.Tracef("get last modified time: %v",lastModifiedTimeStr)

if lastModifiedTimeStr !=""{
if lastModifiedTimeStr !=""{ //TODO, if the files are newly shared and with old timestamp and we may missed
// Parse last indexed time
parsedTime, err := time.Parse(time.RFC3339Nano, lastModifiedTimeStr)
if err != nil {
Expand Down Expand Up @@ -113,7 +129,11 @@ func (this *Plugin) startIndexingFiles(tenantID,userID string,tok *oauth2.Token)

// Map Google Drive file to Document struct
document := common.Document{
Source: "google_drive",
Source: common.DataSource{
//ID: "",//TODO
Name: "google_drive",
Type: "connector",
},
Title: i.Name,
Summary: i.Description,
Type: i.MimeType,
Expand All @@ -124,30 +144,32 @@ func (this *Plugin) startIndexingFiles(tenantID,userID string,tok *oauth2.Token)
UserName: i.Owners[0].DisplayName,
UserID: i.Owners[0].EmailAddress,
},
Icon: i.IconLink,
Icon: getIcon(i.MimeType),
Thumbnail: i.ThumbnailLink,
Metadata: util.MapStr{
"drive_id": i.DriveId,
"file_id": i.Id,
"email": i.Owners[0].EmailAddress,
"file_extension": i.FileExtension,
"kind": i.Kind,
"shared": i.Shared,
"spaces": i.Spaces,
"starred": i.Starred,
"web_view_link": i.WebViewLink,
"labels": i.LabelInfo,
"parents": i.Parents,
"permissions": i.Permissions,
"permission_ids": i.PermissionIds,
"properties": i.Properties,
},

}

document.ID = i.Id
document.ID = i.Id //add tenant namespace and then hash
document.Created = createdAt
document.Updated = updatedAt

document.Metadata= util.MapStr{
"drive_id": i.DriveId,
"file_id": i.Id,
"email": i.Owners[0].EmailAddress,
"file_extension": i.FileExtension,
"kind": i.Kind,
"shared": i.Shared,
"spaces": i.Spaces,
"starred": i.Starred,
"web_view_link": i.WebViewLink,
"labels": i.LabelInfo,
"parents": i.Parents,
"permissions": i.Permissions,
"permission_ids": i.PermissionIds,
"properties": i.Properties,
}

if i.LastModifyingUser != nil {
document.LastUpdatedBy = &common.EditorInfo{
UserInfo: common.UserInfo{
Expand All @@ -159,21 +181,23 @@ func (this *Plugin) startIndexingFiles(tenantID,userID string,tok *oauth2.Token)
}
}

document.Payload= util.MapStr{}

// Handle optional fields
if i.SharingUser != nil {
document.Metadata["sharingUser"] = common.UserInfo{
document.Payload["sharingUser"] = common.UserInfo{
UserAvatar: i.SharingUser.PhotoLink,
UserName: i.SharingUser.DisplayName,
UserID: i.SharingUser.EmailAddress,
}
}

if i.VideoMediaMetadata != nil {
document.Metadata["video_metadata"] = i.VideoMediaMetadata
document.Payload["video_metadata"] = i.VideoMediaMetadata
}

if i.ImageMediaMetadata != nil {
document.Metadata["image_metadata"] = i.ImageMediaMetadata
document.Payload["image_metadata"] = i.ImageMediaMetadata
}

// Convert to JSON and push to queue
Expand Down
Loading
Loading