Skip to content

Commit da4b276

Browse files
committed
a lot of files for tests
1 parent 7ae95e8 commit da4b276

114 files changed

Lines changed: 1992 additions & 344 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,3 +518,6 @@ coverage*.info
518518

519519
# Temporary test files
520520
*.tmp
521+
522+
# Generated test asset catalog
523+
tests/MarkItDown.Tests/Generated/

AGENTS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ If I tell you to remember something, you do the same, update
1212
- MIME handling: always use `ManagedCode.MimeTypes` for MIME constants, lookups, and validation logic.
1313
- Treat this repository as a high-fidelity port of `microsoft-markitdown`: every test fixture copied from the upstream `tests/test_files/` directory must be referenced by .NET tests (either as positive conversions or explicit unsupported cases). No orphaned fixtures.
1414
- CSV parsing must use the `Sep` library; avoid Sylvan or other CSV parsers for new or updated code.
15+
- Format integration tasks: never break the project or existing tests, and validate new format handling against real sample files.
16+
- Test fixtures must be surfaced via the auto-generated `TestAssetCatalog`; add binaries under `TestFiles/` and rely on its constants in tests.
1517

1618
# Repository Guidelines
1719

src/MarkItDown.Cli/ConversionService.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ public async Task<ConversionSummary> ConvertFilesAsync(IReadOnlyList<string> fil
2020

2121
Directory.CreateDirectory(outputDirectory);
2222
var results = new List<ConversionResult>(files.Count);
23-
var markItDown = new MarkItDown(options);
23+
var markItDown = new MarkItDownClient(options);
2424

2525
for (var index = 0; index < files.Count; index++)
2626
{
@@ -53,7 +53,7 @@ public async Task<ConversionSummary> ConvertUrlAsync(string url, string outputDi
5353
}
5454

5555
Directory.CreateDirectory(outputDirectory);
56-
var markItDown = new MarkItDown(options);
56+
var markItDown = new MarkItDownClient(options);
5757
var conversion = await markItDown.ConvertFromUrlAsync(url, cancellationToken: cancellationToken).ConfigureAwait(false);
5858
var outputPath = await WriteMarkdownAsync(conversion.Markdown, DeriveFileNameFromUrl(url, conversion.Title), outputDirectory, cancellationToken).ConfigureAwait(false);
5959
var result = new ConversionResult(url, outputPath, true, null, conversion.Segments.Count);

src/MarkItDown.Cli/MarkItDown.Cli.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
<IncludeAllContentForSelfExtract>true</IncludeAllContentForSelfExtract>
1212
<EnableCompressionInSingleFile>true</EnableCompressionInSingleFile>
1313
<SelfContained Condition="'$(SelfContained)' == ''">false</SelfContained>
14+
<IsPackable>false</IsPackable>
1415
</PropertyGroup>
1516

1617
<ItemGroup>

src/MarkItDown/Converters/XlsxConverter.cs

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -212,37 +212,51 @@ private static string ConvertWorksheetToMarkdown(WorksheetPart worksheetPart, st
212212

213213
private static string GetCellValue(Cell cell, SharedStringTable? stringTable)
214214
{
215-
if (cell.CellValue == null)
216-
return "";
215+
var dataType = cell.DataType?.Value;
216+
var cellValue = cell.CellValue?.Text;
217217

218-
var value = cell.CellValue.Text;
219-
220-
if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString)
218+
if (dataType == CellValues.InlineString)
221219
{
222-
// Look up the value in the shared string table
223-
if (stringTable != null && int.TryParse(value, out var stringIndex))
220+
return cell.InlineString?.InnerText ?? cell.InnerText ?? string.Empty;
221+
}
222+
223+
if (!string.IsNullOrEmpty(cellValue))
224+
{
225+
if (dataType == CellValues.SharedString)
224226
{
225-
var stringItem = stringTable.Elements<SharedStringItem>().ElementAtOrDefault(stringIndex);
226-
if (stringItem != null)
227+
if (stringTable != null && int.TryParse(cellValue, out var stringIndex))
227228
{
228-
return stringItem.InnerText;
229+
var stringItem = stringTable.Elements<SharedStringItem>().ElementAtOrDefault(stringIndex);
230+
if (stringItem is not null)
231+
{
232+
return stringItem.InnerText;
233+
}
229234
}
230235
}
236+
else if (dataType == CellValues.Boolean)
237+
{
238+
return cellValue == "0" ? "FALSE" : "TRUE";
239+
}
240+
else if (dataType == CellValues.Date && double.TryParse(cellValue, out var dateValue))
241+
{
242+
return DateTime.FromOADate(dateValue).ToString("yyyy-MM-dd");
243+
}
244+
245+
return cellValue;
231246
}
232-
else if (cell.DataType != null && cell.DataType.Value == CellValues.Boolean)
247+
248+
if (cell.CellFormula is not null && !string.IsNullOrWhiteSpace(cell.CellFormula.Text))
233249
{
234-
return value == "0" ? "FALSE" : "TRUE";
250+
return "=" + cell.CellFormula.Text.Trim();
235251
}
236-
else if (cell.DataType != null && cell.DataType.Value == CellValues.Date)
252+
253+
if (cell.InlineString is not null)
237254
{
238-
if (double.TryParse(value, out var dateValue))
239-
{
240-
var date = DateTime.FromOADate(dateValue);
241-
return date.ToString("yyyy-MM-dd");
242-
}
255+
return cell.InlineString.InnerText;
243256
}
244257

245-
return value ?? "";
258+
var innerText = cell.InnerText;
259+
return innerText ?? string.Empty;
246260
}
247261

248262
private static void CreateMarkdownTable(List<List<string>> tableData, StringBuilder result)

src/MarkItDown/MarkItDownClient.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
using MarkItDown.Intelligence.Providers.Azure;
2121
using MarkItDown.Intelligence.Providers.Google;
2222
using MarkItDown.YouTube;
23-
using Microsoft.Extensions.Logging;
2423

2524
namespace MarkItDown;
2625

tests/MarkItDown.Tests/Converters/YouTubeUrlConverterTests.cs

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
using System.Collections.Generic;
2+
using System.Globalization;
3+
using System.IO;
4+
using System.Linq;
25
using System.Text;
6+
using System.Text.Json;
37
using MarkItDown.Converters;
48
using MarkItDown.Intelligence.Models;
59
using MarkItDown.YouTube;
610
using Shouldly;
711
using Xunit;
12+
using MarkItDown.Tests;
813

914
namespace MarkItDown.Tests.Converters;
1015

@@ -50,6 +55,72 @@ private sealed class NullYouTubeMetadataProvider : IYouTubeMetadataProvider
5055
}
5156
}
5257

58+
[Fact]
59+
public async Task ConvertAsync_WithRecordedMetadata_RendersVideoDetails()
60+
{
61+
var metadata = LoadRecordedMetadata();
62+
var provider = new FixtureYouTubeMetadataProvider(metadata);
63+
var converter = new YouTubeUrlConverter(provider);
64+
var streamInfo = new StreamInfo(url: "https://www.youtube.com/watch?v=8hnpIIamb6k");
65+
66+
var result = await converter.ConvertAsync(Stream.Null, streamInfo);
67+
68+
result.Title.ShouldBe(metadata.Title);
69+
result.Markdown.ShouldContain(metadata.Title);
70+
result.Markdown.ShouldContain("Managed Code");
71+
result.Markdown.ShouldContain("**Views:** 483");
72+
result.Markdown.ShouldContain("SOLID Principles");
73+
result.Markdown.ShouldContain("## Captions");
74+
result.Segments.ShouldContain(segment => segment.Type == SegmentType.Metadata);
75+
result.Segments.Count(s => s.Type == SegmentType.Audio).ShouldBe(metadata.Captions.Count);
76+
77+
var firstCaption = result.Segments.First(s => s.Type == SegmentType.Audio);
78+
firstCaption.StartTime.ShouldBe(TimeSpan.FromSeconds(0));
79+
firstCaption.Markdown.ShouldContain("SOLID principles");
80+
}
81+
82+
private static YouTubeMetadata LoadRecordedMetadata()
83+
{
84+
var jsonPath = TestAssetLoader.GetAssetPath(TestAssetCatalog.YoutubeSolidPrinciplesJson);
85+
using var stream = File.OpenRead(jsonPath);
86+
var fixture = JsonSerializer.Deserialize<YouTubeMetadataFixture>(stream, new JsonSerializerOptions
87+
{
88+
PropertyNameCaseInsensitive = true
89+
});
90+
91+
if (fixture is null)
92+
{
93+
throw new InvalidOperationException("Failed to deserialize recorded YouTube metadata fixture.");
94+
}
95+
96+
var captions = fixture.Captions.Select(c => new YouTubeCaptionSegment(
97+
c.Text,
98+
c.Start is not null ? TimeSpan.FromSeconds(c.Start.Value) : null,
99+
c.End is not null ? TimeSpan.FromSeconds(c.End.Value) : null,
100+
c.Metadata ?? new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
101+
)).ToList();
102+
103+
var thumbnails = fixture.Thumbnails.Select(uri => new Uri(uri)).ToList();
104+
var additional = fixture.AdditionalMetadata ?? new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
105+
106+
return new YouTubeMetadata(
107+
VideoId: fixture.VideoId,
108+
Title: fixture.Title,
109+
ChannelTitle: fixture.ChannelTitle,
110+
WatchUrl: new Uri(fixture.WatchUrl),
111+
ChannelUrl: new Uri(fixture.ChannelUrl),
112+
Duration: fixture.DurationSeconds is not null ? TimeSpan.FromSeconds(fixture.DurationSeconds.Value) : null,
113+
UploadDate: fixture.UploadDate is not null ? DateTimeOffset.Parse(fixture.UploadDate, CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal) : null,
114+
ViewCount: fixture.ViewCount,
115+
LikeCount: fixture.LikeCount,
116+
Tags: fixture.Tags ?? Array.Empty<string>(),
117+
Description: fixture.Description,
118+
Thumbnails: thumbnails,
119+
Captions: captions,
120+
AdditionalMetadata: additional
121+
);
122+
}
123+
53124
private sealed class StubYouTubeMetadataProvider : IYouTubeMetadataProvider
54125
{
55126
public Task<YouTubeMetadata?> GetVideoAsync(string videoId, CancellationToken cancellationToken = default)
@@ -81,4 +152,45 @@ private sealed class StubYouTubeMetadataProvider : IYouTubeMetadataProvider
81152
return Task.FromResult<YouTubeMetadata?>(metadata);
82153
}
83154
}
155+
156+
private sealed class FixtureYouTubeMetadataProvider : IYouTubeMetadataProvider
157+
{
158+
private readonly YouTubeMetadata metadata;
159+
160+
public FixtureYouTubeMetadataProvider(YouTubeMetadata metadata)
161+
{
162+
this.metadata = metadata;
163+
}
164+
165+
public Task<YouTubeMetadata?> GetVideoAsync(string videoId, CancellationToken cancellationToken = default)
166+
{
167+
return Task.FromResult<YouTubeMetadata?>(metadata);
168+
}
169+
}
170+
171+
private sealed class YouTubeMetadataFixture
172+
{
173+
public string VideoId { get; init; } = string.Empty;
174+
public string Title { get; init; } = string.Empty;
175+
public string ChannelTitle { get; init; } = string.Empty;
176+
public string WatchUrl { get; init; } = string.Empty;
177+
public string ChannelUrl { get; init; } = string.Empty;
178+
public double? DurationSeconds { get; init; }
179+
public string? UploadDate { get; init; }
180+
public long? ViewCount { get; init; }
181+
public long? LikeCount { get; init; }
182+
public IReadOnlyList<string>? Tags { get; init; }
183+
public string? Description { get; init; }
184+
public IReadOnlyList<string> Thumbnails { get; init; } = Array.Empty<string>();
185+
public IReadOnlyList<YouTubeCaptionFixture> Captions { get; init; } = Array.Empty<YouTubeCaptionFixture>();
186+
public IReadOnlyDictionary<string, string>? AdditionalMetadata { get; init; }
187+
}
188+
189+
private sealed class YouTubeCaptionFixture
190+
{
191+
public string Text { get; init; } = string.Empty;
192+
public double? Start { get; init; }
193+
public double? End { get; init; }
194+
public IReadOnlyDictionary<string, string>? Metadata { get; init; }
195+
}
84196
}

tests/MarkItDown.Tests/DocxConverterTests.cs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,33 @@ public async Task ConvertAsync_DocxWithImages_ExecutesPipelineAndCapturesArtifac
4545
var trailing = segment.Markdown[(placeholderIndex + image.PlaceholderMarkdown!.Length)..];
4646
trailing.TrimStart('\r', '\n').ShouldStartWith("DOCX ENRICHED");
4747
}
48+
49+
[Fact]
50+
public async Task ConvertAsync_ComplexDocx_PreservesRichContent()
51+
{
52+
var client = new MarkItDownClient();
53+
var path = TestAssetLoader.GetAssetPath(TestAssetCatalog.ComplexDocx);
54+
55+
var result = await client.ConvertAsync(path);
56+
57+
result.Title.ShouldBe("Rich Text Formatting");
58+
result.Markdown.ShouldContain("https://example.com/docs");
59+
result.Markdown.ShouldContain("| Metric | Q1 | Q2 | Total |");
60+
result.Markdown.ShouldContain("Equation: x^2 + y^2 = z^2");
61+
result.Markdown.ShouldContain("• Bullet list item one");
62+
result.Markdown.ShouldContain("![Image 1]");
63+
}
64+
65+
[Fact]
66+
public async Task ConvertAsync_BrokenDocx_RaisesFileConversionError()
67+
{
68+
var client = new MarkItDownClient();
69+
var path = TestAssetLoader.GetAssetPath(TestAssetCatalog.BrokenDocx);
70+
71+
var exception = await Should.ThrowAsync<UnsupportedFormatException>(async () => await client.ConvertAsync(path));
72+
exception.InnerException.ShouldNotBeNull();
73+
exception.InnerException.ShouldBeOfType<AggregateException>();
74+
var aggregate = (AggregateException)exception.InnerException!;
75+
aggregate.InnerExceptions.ShouldContain(e => e is FileConversionException);
76+
}
4877
}

0 commit comments

Comments
 (0)