import(urlsource"trpc.group/trpc-go/trpc-agent-go/knowledge/source/url")urlSrcAlias:=urlsource.New([]string{"https://trpc-go.com/docs/api.md"},// Identifier URL (for document ID and metadata)urlsource.WithContentFetchingURL([]string{"https://github.com/trpc-group/trpc-go/raw/main/docs/api.md"}),// Actual content fetching URLurlsource.WithName("TRPC API Docs"),urlsource.WithMetadataValue("source","github"),)
Note: When using WithContentFetchingURL, the identifier URL should retain the file path information from the content fetching URL, so that the document ID and metadata stay meaningful. For example:
- Correct: Identifier URL is https://trpc-go.com/docs/api.md, fetching URL is https://github.com/.../docs/api.md
- Incorrect: Identifier URL is https://trpc-go.com, which loses the document path information
Auto Source
Intelligent type detection, automatically selects processor:
import("trpc.group/trpc-go/trpc-agent-go/knowledge"openaiembedder"trpc.group/trpc-go/trpc-agent-go/knowledge/embedder/openai""trpc.group/trpc-go/trpc-agent-go/knowledge/source"filesource"trpc.group/trpc-go/trpc-agent-go/knowledge/source/file"dirsource"trpc.group/trpc-go/trpc-agent-go/knowledge/source/dir"urlsource"trpc.group/trpc-go/trpc-agent-go/knowledge/source/url"autosource"trpc.group/trpc-go/trpc-agent-go/knowledge/source/auto"vectorinmemory"trpc.group/trpc-go/trpc-agent-go/knowledge/vectorstore/inmemory")// Combine multiple sourcessources:=[]source.Source{fileSrc,dirSrc,urlSrc,autoSrc}embedder:=openaiembedder.New(openaiembedder.WithModel("text-embedding-3-small"))vectorStore:=vectorinmemory.New()// Pass to Knowledgekb:=knowledge.New(knowledge.WithEmbedder(embedder),knowledge.WithVectorStore(vectorStore),knowledge.WithSources(sources),)// Load all sourcesiferr:=kb.Load(ctx);err!=nil{log.Fatalf("Failed to load knowledge base: %v",err)}
Configuring Metadata
To enable filter functionality, it's recommended to add rich metadata when creating document sources.
sources := []source.Source{
	// File source with metadata.
	filesource.New(
		[]string{"./docs/api.md"},
		filesource.WithName("API Documentation"),
		filesource.WithMetadataValue("category", "documentation"),
		filesource.WithMetadataValue("topic", "api"),
		filesource.WithMetadataValue("service_type", "gateway"),
		filesource.WithMetadataValue("protocol", "trpc-go"),
		filesource.WithMetadataValue("version", "v1.0"),
	),
	// Directory source with metadata.
	dirsource.New(
		[]string{"./tutorials"},
		dirsource.WithName("Tutorials"),
		dirsource.WithMetadataValue("category", "tutorial"),
		dirsource.WithMetadataValue("difficulty", "beginner"),
		dirsource.WithMetadataValue("topic", "programming"),
	),
	// URL source with metadata.
	urlsource.New(
		[]string{"https://example.com/wiki/rpc"},
		urlsource.WithName("RPC Wiki"),
		urlsource.WithMetadataValue("category", "encyclopedia"),
		urlsource.WithMetadataValue("source_type", "web"),
		urlsource.WithMetadataValue("topic", "rpc"),
		urlsource.WithMetadataValue("language", "zh"),
	),
}
Transformer is used to preprocess and postprocess content before and after document chunking. This is particularly useful for cleaning text extracted from PDFs, web pages, and other sources, removing excess whitespace, duplicate characters, and other noise.
import(filesource"trpc.group/trpc-go/trpc-agent-go/knowledge/source/file"dirsource"trpc.group/trpc-go/trpc-agent-go/knowledge/source/dir"urlsource"trpc.group/trpc-go/trpc-agent-go/knowledge/source/url"autosource"trpc.group/trpc-go/trpc-agent-go/knowledge/source/auto""trpc.group/trpc-go/trpc-agent-go/knowledge/transform")// Create transformersfilter:=transform.NewCharFilter("\t")// Remove tabsdedup:=transform.NewCharDedup(" ","\n")// Merge consecutive spaces and newlines// File source with transformersfileSrc:=filesource.New([]string{"./data/document.pdf"},filesource.WithTransformers(filter,dedup),)// Directory source with transformersdirSrc:=dirsource.New([]string{"./docs"},dirsource.WithTransformers(filter,dedup),)// URL source with transformersurlSrc:=urlsource.New([]string{"https://example.com/article"},urlsource.WithTransformers(filter,dedup),)// Auto source with transformersautoSrc:=autosource.New([]string{"./mixed-content"},autosource.WithTransformers(filter,dedup),)
// First remove tabs, then merge consecutive spacesfilter:=transform.NewCharFilter("\t")dedup:=transform.NewCharDedup(" ")src:=filesource.New([]string{"./data/messy.txt"},filesource.WithTransformers(filter,dedup),// Executed in order)
Typical Use Cases
Scenario
Recommended Configuration
PDF text cleanup
CharDedup(" ", "\n") - Merge excess spaces and newlines from PDF extraction
Web content processing
CharFilter("\t") + CharDedup(" ") - Remove tabs and merge spaces
Since the PDF reader depends on third-party libraries, to avoid introducing unnecessary dependencies in the main module, the PDF reader uses a separate go.mod.
To support PDF file reading, manually import the PDF reader package in your code: