Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
348 changes: 288 additions & 60 deletions packages/web/src/features/search/fileSourceApi.ts
Original file line number Diff line number Diff line change
@@ -1,84 +1,312 @@
import 'server-only';
import { fileNotFound, ServiceError, unexpectedError } from "../../lib/serviceError";
import { fileNotFound, notFound, ServiceError } from "../../lib/serviceError";
import { FileSourceRequest, FileSourceResponse } from "./types";
import { isServiceError } from "../../lib/utils";
import { search } from "./searchApi";
import { getCodeHostBrowseFileAtBranchUrl } from "../../lib/utils";
import { sew } from "@/actions";
import { withOptionalAuthV2 } from "@/withAuthV2";
import { QueryIR } from './ir';
import escapeStringRegexp from "escape-string-regexp";

// @todo (bkellam) #574 : We should really be using `git show <hash>:<path>` to fetch file contents here.
// This will allow us to support permalinks to files at a specific revision that may not be indexed
// by zoekt. We should also refactor this out of the /search folder.
import { Repo } from "@sourcebot/db";
import { env } from "@sourcebot/shared";
import path from 'path';
import { simpleGit } from 'simple-git';

/**
 * Retrieves the file source (content) for a given file in a repository.
 *
 * The repository record is looked up in the database (scoped to the caller's org) and
 * the file is read directly from the on-disk git repository with
 * `git show <revision>:<path>`, rather than through a zoekt search (which requires
 * shards to be loaded).
 *
 * @param fileName - Path of the file inside the repository tree.
 * @param repository - Repository name, matched against `repo.name`.
 * @param branch - Optional revision to read from; `HEAD` is used when omitted.
 * @returns The file contents together with repository metadata, or a `ServiceError`:
 * `notFound()` when no matching repository exists for the org, `fileNotFound(...)`
 * when the revision is rejected or `git show` fails.
 */
export const getFileSource = async ({ fileName, repository, branch }: FileSourceRequest): Promise<FileSourceResponse | ServiceError> => sew(() =>
    withOptionalAuthV2(async ({ org, prisma }) => {
        // Query the database to get the repository record for this org.
        const repo = await prisma.repo.findFirst({
            where: {
                name: repository,
                orgId: org.id,
            },
        });

        if (!repo) {
            return notFound();
        }

        const revisionName = branch ?? 'HEAD';

        // The revision is caller-supplied and ends up at the start of a git argument.
        // A value beginning with '-' would be parsed by git as a command-line option
        // instead of a revision, so refuse it before invoking git.
        if (revisionName.startsWith('-')) {
            return fileNotFound(fileName, repository);
        }

        const { path: repoPath } = getRepoPath(repo);
        const git = simpleGit().cwd(repoPath);

        let source: string;
        try {
            // git show <revision>:<path> returns the file content at the given revision.
            source = await git.show([`${revisionName}:${fileName}`]);
        } catch (_error) {
            // git show fails when the file (or revision) does not exist; report that
            // to the caller as a missing file.
            return fileNotFound(fileName, repository);
        }

        // Determine the language from the file name / extension.
        const language = getLanguageFromFileName(fileName);

        // Construct the web URL for viewing this file on the code host.
        const webUrl = getCodeHostBrowseFileAtBranchUrl({
            webUrl: repo.webUrl,
            codeHostType: repo.external_codeHostType,
            branchName: revisionName,
            filePath: fileName,
        });

        return {
            source,
            language,
            path: fileName,
            repository,
            repositoryCodeHostType: repo.external_codeHostType,
            repositoryDisplayName: repo.displayName ?? undefined,
            repositoryWebUrl: repo.webUrl ?? undefined,
            branch,
            webUrl,
        } satisfies FileSourceResponse;
    }));

/**
 * Returns the path to the git repository on disk.
 *
 * - A `genericGitHost` repo whose clone URL uses the `file:` protocol is read in place:
 *   the URL's pathname (percent-decoded) is the path, and it is flagged read-only since
 *   we aren't guaranteed to have write access to the local filesystem.
 * - Every other repo lives under `<DATA_CACHE_DIR>/repos/<repo.id>`.
 *
 * @note: This is duplicated from the `getRepoPath` function in the
 * backend's `utils.ts` file and `fileTree/api.ts`. Eventually we should
 * move this to a shared package.
 *
 * @param repo - The repository record.
 * @returns The on-disk path and whether it must be treated as read-only.
 */
const getRepoPath = (repo: Repo): { path: string, isReadOnly: boolean } => {
    const cloneUrl = new URL(repo.cloneUrl);

    if (repo.external_codeHostType === 'genericGitHost' && cloneUrl.protocol === 'file:') {
        // `URL.pathname` is percent-encoded (a space becomes `%20`), which is not a
        // valid filesystem path. Decode it; if the pathname contains a malformed
        // escape sequence, fall back to the raw value rather than throwing.
        let localPath = cloneUrl.pathname;
        try {
            localPath = decodeURIComponent(localPath);
        } catch (_error) {
            // Keep the undecoded pathname.
        }

        return {
            path: localPath,
            isReadOnly: true,
        };
    }

    return {
        path: path.join(env.DATA_CACHE_DIR, 'repos', repo.id.toString()),
        isReadOnly: false,
    };
}

/**
 * Lookup table from file extension (leading dot included) to the GitHub Linguist
 * language name used for syntax highlighting in the UI.
 *
 * The table is declared as language -> extensions groups and flattened into an
 * extension -> language record once, at module load.
 *
 * @see https://github.com/github-linguist/linguist/blob/main/lib/linguist/languages.yml
 */
const extensionToLanguageMap: Record<string, string> = (() => {
    const extensionsByLanguage: ReadonlyArray<readonly [string, readonly string[]]> = [
        // JavaScript/TypeScript
        ['JavaScript', ['.js', '.jsx', '.mjs', '.cjs']],
        ['TypeScript', ['.ts', '.mts', '.cts']],
        ['TSX', ['.tsx']],

        // Web
        ['HTML', ['.html', '.htm']],
        ['CSS', ['.css']],
        ['SCSS', ['.scss']],
        ['Sass', ['.sass']],
        ['Less', ['.less']],
        ['Vue', ['.vue']],
        ['Svelte', ['.svelte']],

        // Python
        ['Python', ['.py', '.pyw']],
        ['Cython', ['.pyx', '.pxd']],

        // Java/JVM
        ['Java', ['.java']],
        ['Kotlin', ['.kt', '.kts']],
        ['Scala', ['.scala']],
        ['Groovy', ['.groovy']],
        ['Gradle', ['.gradle']],
        ['Clojure', ['.clj', '.cljs']],

        // C/C++
        ['C', ['.c', '.h']],
        ['C++', ['.cpp', '.cc', '.cxx', '.hpp', '.hxx', '.hh']],

        // C#/F#
        ['C#', ['.cs']],
        ['F#', ['.fs', '.fsx']],

        // Go
        ['Go', ['.go']],
        ['Go Module', ['.mod']],

        // Rust
        ['Rust', ['.rs']],

        // Ruby
        ['Ruby', ['.rb', '.rake', '.gemspec']],
        ['HTML+ERB', ['.erb']],

        // PHP
        ['PHP', ['.php', '.phtml']],

        // Swift/Objective-C
        ['Swift', ['.swift']],
        ['Objective-C', ['.m']],
        ['Objective-C++', ['.mm']],

        // Shell
        ['Shell', ['.sh', '.bash', '.zsh']],
        ['fish', ['.fish']],
        ['PowerShell', ['.ps1', '.psm1']],
        ['Batchfile', ['.bat', '.cmd']],

        // Data/Config
        ['JSON', ['.json']],
        ['JSON with Comments', ['.jsonc']],
        ['JSON5', ['.json5']],
        ['YAML', ['.yaml', '.yml']],
        ['TOML', ['.toml']],
        ['XML', ['.xml']],
        ['INI', ['.ini', '.cfg', '.conf']],
        ['Java Properties', ['.properties']],

        // Markdown/Text
        ['Markdown', ['.md', '.markdown']],
        ['MDX', ['.mdx']],
        ['reStructuredText', ['.rst']],
        ['Text', ['.txt']],
        ['TeX', ['.tex', '.latex']],

        // SQL
        ['SQL', ['.sql', '.mysql']],
        ['PLpgSQL', ['.pgsql']],

        // Other languages
        ['R', ['.r', '.R']],
        ['Lua', ['.lua']],
        ['Perl', ['.pl', '.pm']],
        ['Elixir', ['.ex', '.exs']],
        ['Erlang', ['.erl', '.hrl']],
        ['Haskell', ['.hs']],
        ['Literate Haskell', ['.lhs']],
        ['OCaml', ['.ml', '.mli']],
        ['Elm', ['.elm']],
        ['Dart', ['.dart']],
        ['Verilog', ['.v']],
        ['SystemVerilog', ['.sv']],
        ['VHDL', ['.vhd', '.vhdl']],

        // DevOps/Infrastructure
        ['Dockerfile', ['.dockerfile']],
        ['HCL', ['.tf', '.hcl']],
        ['Protocol Buffer', ['.proto']],
        ['GraphQL', ['.graphql', '.gql']],

        // Build files
        ['CMake', ['.cmake']],
        ['Makefile', ['.make', '.mk']],

        // Misc
        ['Diff', ['.diff', '.patch']],
        ['Zig', ['.zig']],
        ['Nim', ['.nim']],
        ['Nix', ['.nix']],
        ['Prisma', ['.prisma']],
        ['WebAssembly', ['.wasm', '.wat']],
        ['Solidity', ['.sol']],
    ];

    // Invert the groups: one [extension, language] pair per extension.
    return Object.fromEntries(
        extensionsByLanguage.flatMap(([language, extensions]) =>
            extensions.map((extension): [string, string] => [extension, language]),
        ),
    );
})();

/**
 * Lookup table from an exact file name to its Linguist language name.
 *
 * Covers files that have no extension, or whose full name should take precedence
 * over extension-based detection. Declared as language -> file names groups and
 * expanded into a file name -> language record once, at module load.
 */
const filenameToLanguageMap: Record<string, string> = (() => {
    const fileNamesByLanguage: ReadonlyArray<readonly [string, readonly string[]]> = [
        ['Dockerfile', ['Dockerfile']],
        ['Makefile', ['Makefile', 'GNUmakefile', 'makefile']],
        ['Ruby', ['Rakefile', 'Gemfile', 'Podfile', 'Vagrantfile']],
        ['Starlark', ['BUILD', 'BUILD.bazel', 'WORKSPACE', 'WORKSPACE.bazel']],
        ['Ignore List', ['.gitignore']],
        ['Git Attributes', ['.gitattributes']],
        ['EditorConfig', ['.editorconfig']],
        ['JSON', ['.babelrc', '.eslintrc', '.prettierrc']],
        ['JSON with Comments', ['tsconfig.json', 'jsconfig.json']],
        ['Dotenv', ['.env', '.env.local', '.env.development', '.env.production']],
    ];

    const byFileName: Record<string, string> = {};
    for (const [language, fileNames] of fileNamesByLanguage) {
        for (const name of fileNames) {
            byFileName[name] = language;
        }
    }
    return byFileName;
})();

/**
 * Determines the GitHub Linguist language name from a file name.
 * Used for syntax highlighting in the UI.
 *
 * Resolution order:
 * 1. The base name is matched exactly against `filenameToLanguageMap`.
 * 2. Otherwise the lower-cased extension is matched against `extensionToLanguageMap`.
 *
 * Both lookups only consider the tables' own keys, so a file whose name happens to
 * match an inherited object property (e.g. `constructor`, `toString`, `__proto__`)
 * is treated as unknown instead of yielding a non-string value.
 *
 * @param fileName - The name or path of the file
 * @returns The Linguist language name, or empty string if unknown
 */
const getLanguageFromFileName = (fileName: string): string => {
    // Own-property lookup: ignores anything inherited from Object.prototype.
    const lookup = (table: Record<string, string>, key: string): string | undefined =>
        Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;

    // Get the base name of the file (without directory path)
    const baseName = path.basename(fileName);

    // Special filenames take precedence over extension-based detection
    const languageForName = lookup(filenameToLanguageMap, baseName);
    if (languageForName) {
        return languageForName;
    }

    // Get the file extension (including the dot); matching is case-insensitive
    const ext = path.extname(fileName).toLowerCase();

    // Return the language for the extension, or empty string if unknown
    return lookup(extensionToLanguageMap, ext) ?? '';
};