360 lines
12 KiB
C#
360 lines
12 KiB
C#
using Microsoft.AspNetCore.Http;
|
|
using Microsoft.Extensions.Primitives;
|
|
using Microsoft.Extensions.DependencyInjection;
|
|
using System;
|
|
using System.Net;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Text;
|
|
using System.Threading.Tasks;
|
|
using Ganss.XSS;
|
|
using System.IO;
|
|
using Markdig;
|
|
using Markdig.Syntax;
|
|
using Blog3000.Shared;
|
|
using Microsoft.Extensions.Logging;
|
|
using AngleSharp;
|
|
using System.Net.Http;
|
|
using Microsoft.CodeAnalysis.CSharp.Syntax;
|
|
using Blog3000.Client;
|
|
|
|
namespace Blog3000.Server.MiddleWares
|
|
{
|
|
public class SEOMgr: IDisposable
|
|
{
|
|
// Collaborators handed in through the constructor.
private readonly ILogger logger;
private readonly BlogEnv blogEnv;
private readonly BlogPostRepo blogPostRepo;

// Copy of BlogConfig.VisibleUrlPrefix; prepended to every URL this class generates.
private readonly string urlPrefix;

// Lock taken while the sitemap is rebuilt. It is static, so it is shared by all instances.
private static readonly object siteMapLock = new object();

// Full path of the file that holds the last generated sitemap.
private readonly string sitemapCacheFile;

// Last generated sitemap; null until UpdateSiteMap has stored a result.
private string siteMapCache;

//
// Update when SEO-Output changes
//
// Used as the lower bound for the <lastmod> dates written by CreateSitemap.
// Declared readonly because it is never meant to be reassigned at runtime.
private static readonly DateTime MINSEODATE = new DateTime(2020, 07, 03);
|
|
|
|
/// <summary>
/// Stores the collaborators, resolves the sitemap cache file location,
/// subscribes to repository change notifications and triggers a first
/// sitemap build.
/// </summary>
/// <param name="logger">Logger used for sitemap and crawler diagnostics.</param>
/// <param name="blogEnv">Blog environment; its BlogConfig supplies the visible URL prefix.</param>
/// <param name="blogPostRepo">Repository whose BlogPostsChanged event triggers sitemap rebuilds.</param>
public SEOMgr(ILogger<SEOMgr> logger, BlogEnv blogEnv, BlogPostRepo blogPostRepo)
{
    this.logger = logger;
    this.blogEnv = blogEnv;
    this.blogPostRepo = blogPostRepo;
    urlPrefix = blogEnv.BlogConfig.VisibleUrlPrefix;

    // The cache file sits in the directory published as the "AppDataPath" app-domain value.
    var appDataPath = (string)AppDomain.CurrentDomain.GetData("AppDataPath");
    sitemapCacheFile = System.IO.Path.Combine(appDataPath, "sitemapcache.xml");

    // Rebuild on every repository change, and once right away by invoking the handler directly.
    blogPostRepo.BlogPostsChanged += BlogPostRepo_BlogPostsChanged;
    BlogPostRepo_BlogPostsChanged(null, null);
}
|
|
|
|
|
|
/// <summary>
/// Detaches this instance from the repository's BlogPostsChanged event.
/// </summary>
public void Dispose() => blogPostRepo.BlogPostsChanged -= BlogPostRepo_BlogPostsChanged;
|
|
|
|
|
|
/// <summary>
/// Returns the cached sitemap, building it first when no cached copy exists.
/// </summary>
/// <returns>The sitemap XML held in the in-memory cache.</returns>
public string GetSiteMap()
{
    if (siteMapCache != null)
    {
        return siteMapCache;
    }

    // Nothing cached yet: build it, then hand out whatever the rebuild stored.
    UpdateSiteMap();
    return siteMapCache;
}
|
|
|
|
|
|
/// <summary>
/// Renders a blog post as a standalone HTML document: the post's markdown is
/// converted to HTML and sanitized, and title, abstract, keywords, author and
/// language from the header are emitted as head metadata and a short preamble.
/// </summary>
/// <param name="bp">Header of the post to render; its text is read from the repository.</param>
/// <returns>A <see cref="StringBuilder"/> holding the complete HTML document.</returns>
public StringBuilder BuildBlogPostOutput(BlogPostHeader bp)
{
    string markdown = blogPostRepo.GetText(bp);

    var title = InputSanitizer.Default.SanitizeText(bp.Title, true);

    // Every topic contributes "<topic>," with single quotes and hashes stripped.
    // The trailing comma is kept on purpose so the emitted document does not change.
    var keywordBuilder = new StringBuilder();
    if (bp.Topics != null)
    {
        foreach (var t in bp.Topics)
        {
            keywordBuilder.Append(t.Replace("'", "").Replace("#", "")).Append(','); // TODO
        }
    }
    var keywords = InputSanitizer.Default.SanitizeText(keywordBuilder.ToString());

    // Single quotes are removed because the value is placed inside a single-quoted attribute.
    var abstr = "" + bp.Abstract;
    abstr = InputSanitizer.Default.SanitizeText(abstr, true).Replace("'", ""); // TODO

    // Map the author id to a full name. When no mapping exists the author stays
    // null and is simply left out; a null value is never passed to the sanitizer.
    string author = null;
    if (!String.IsNullOrEmpty(bp.Author))
    {
        blogEnv.BlogConfig?.AuthorFullnames?.TryGetValue(bp.Author, out author);
        if (author != null)
        {
            author = InputSanitizer.Default.SanitizeText(author, true);
        }
    }

    var mdPipeline = new MarkdownPipelineBuilder().UseAdvancedExtensions().Build();
    var mdoc = Markdig.Markdown.Parse(markdown, mdPipeline);

    // Collect the URLs of all link reference definitions; links pointing at one
    // of them are rewritten below. A set is used so that two definitions sharing
    // the same URL no longer raise a duplicate-key ArgumentException.
    var referencedUrls = new HashSet<string>();
    foreach (var r in mdoc.GetLinkReferenceDefinitions().Links)
    {
        var url = r.Value?.Url;
        if (url != null)
        {
            referencedUrls.Add(url);
        }
    }

    // Render markdown, rewrite image-urls
    string html;
    using (var writer = new StringWriter())
    {
        var renderer = new Markdig.Renderers.HtmlRenderer(writer);
        renderer.LinkRewriter += (a) =>
        {
            // NOTE(review): the rewritten path embeds the URL itself, not the
            // reference label - confirm this matches the imgref route.
            if (referencedUrls.Contains(a))
            {
                return $"BlogPosts/{bp.Id}/imgref/{a}";
            }
            return a;
        };
        mdPipeline.Setup(renderer);
        renderer.Render(mdoc);

        html = writer.ToString();
    }

    // Create sanitized html
    html = InputSanitizer.Default.SanitizeHtml(html);

    // Fall back to English when the header carries no language.
    var lang = "en";
    if (!String.IsNullOrEmpty(bp.Lang))
    {
        lang = InputSanitizer.Default.SanitizeText(bp.Lang);
    }

    // Build final document
    StringBuilder sb = new StringBuilder();
    sb.AppendLine("");
    sb.AppendLine($"<html lang='{lang}'>");
    sb.AppendLine("<head>");
    sb.AppendLine($"<title>{title}</title>");
    sb.AppendLine("<meta charset='UTF-8'/>");
    sb.AppendLine("<meta name='viewport' content='width=device-width, initial-scale=1.0'>");
    sb.AppendLine($"<meta name='generator' content='blog3000 build:{BuildVersion.BUILD_DATE}'>");
    sb.AppendLine($"<meta name='description' content='{abstr}'></meta>");
    sb.AppendLine($"<meta name='keywords' content='{keywords}'></meta>");
    if (!String.IsNullOrEmpty(author))
    {
        sb.AppendLine($"<meta name='author' content='{author}'></meta>");
    }
    sb.AppendLine("</head>");
    sb.AppendLine("<body>");
    sb.AppendLine($"<h1>{title}</h1>");
    sb.AppendLine("<hr/>");
    sb.AppendLine($"<div><i>Abstract: {abstr}</i></div>");
    if (!String.IsNullOrEmpty(author))
    {
        sb.AppendLine($"<div><i>Author: {author}</i></div>");
    }
    sb.AppendLine("<hr/>");
    sb.AppendLine(html);
    sb.AppendLine("</body></html>");

    return sb;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Builds the sitemap XML: one url entry for every post whose Access equals
/// "PUBLIC" (ordered by Id), followed by an entry for the index page.
/// </summary>
/// <returns>The complete sitemap document as a string.</returns>
private string CreateSitemap()
{
    // Create list with blogpost-links
    StringBuilder sb = new StringBuilder();
    sb.AppendLine("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
    sb.AppendLine("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">");

    // Newest modification date over all listed posts; reused for the index page.
    DateTime? latestTime = null;

    // Articles
    foreach (var p in blogPostRepo.GetHeaders().Where(x => "PUBLIC".Equals(x.Access)).OrderBy(x => x.Id))
    {
        // Entity-escape the location so characters such as '&' or '<' in the
        // prefix or the id cannot produce malformed XML.
        var articleLoc = WebUtility.HtmlEncode(urlPrefix + "/viewer/" + p.Id);

        sb.AppendLine(" <url>");
        sb.AppendLine($" <loc>{articleLoc}</loc>");

        var dt = blogPostRepo.GetFileModDate(p);
        if (dt != null)
        {
            // Never report a date older than MINSEODATE.
            DateTime lastMod = dt.Value;
            if (MINSEODATE > lastMod)
            {
                lastMod = MINSEODATE;
            }
            sb.AppendLine($" <lastmod>{lastMod.ToString("o")}</lastmod>");

            if (latestTime == null || lastMod > latestTime.Value)
            {
                latestTime = lastMod;
            }
        }

        sb.AppendLine(" <changefreq>weekly</changefreq>");
        sb.AppendLine(" </url>");
    }

    // Index page
    var indexLoc = WebUtility.HtmlEncode(urlPrefix + "/");
    sb.AppendLine(" <url>");
    sb.AppendLine($" <loc>{indexLoc}</loc>");
    sb.AppendLine(" <changefreq>weekly</changefreq>");
    sb.AppendLine(" <priority>0.1</priority>");
    if (latestTime != null)
    {
        sb.AppendLine($" <lastmod>{latestTime.Value.ToString("o")}</lastmod>");
    }
    sb.AppendLine(" </url>");

    // Done
    sb.AppendLine("</urlset>");
    return sb.ToString();
}
|
|
|
|
|
|
// Set while a rebuild is running so that a nested call can detect it and skip.
private bool isUpdatingSitemap = false;

/// <summary>
/// Rebuilds the sitemap, stores it in memory and in the cache file, and
/// notifies the crawlers in the background when the content differs from the
/// previous sitemap (in-memory copy, or the cache file when none is in memory).
/// </summary>
private void UpdateSiteMap()
{
    lock (siteMapLock)
    {
        // Check if this thread is already rebuilding the sitemap and skip.
        // (Most importantly this avoids notifying the crawlers twice.)
        // A C# lock can be re-entered by the thread that holds it.
        // That may occur here when the BlogPostRepo raises its changed event,
        // triggered by accessing the BlogPostRepo during sitemap creation.
        if (isUpdatingSitemap)
        {
            logger.LogInformation("Already rebuilding Sitemap. Skip");
            return;
        }

        isUpdatingSitemap = true;
        try
        {
            var oldSm = siteMapCache;
            if (String.IsNullOrEmpty(oldSm))
            {
                oldSm = LoadSitemapCache();
            }

            logger.LogInformation("Rebuilding Sitemap");
            var sm = CreateSitemap();
            siteMapCache = sm;
            SaveSitemapCache(sm);

            if (String.Equals(sm, oldSm))
            {
                logger.LogInformation("Sitemap not changed");
                return;
            }

            logger.LogInformation("Sitemap changed");

            // Fire and forget. Exceptions are caught and logged inside the task,
            // because nobody awaits it and they would otherwise go unobserved.
            Task.Run(async () =>
            {
                try
                {
                    await PingCrawlersAsync();
                }
                catch (Exception ex)
                {
                    logger.LogWarning(ex, "Notifying crawlers failed");
                }
            });
        }
        finally
        {
            isUpdatingSitemap = false;
        }
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Tells Google and Bing that the sitemap changed by requesting their ping
/// endpoints, provided the configuration allows search bots.
/// </summary>
/// <returns>
/// false when neither AllowSearchBots nor AllowSearchDevBot is set, or when at
/// least one ping request failed; true when both requests received a response.
/// </returns>
private async Task<bool> PingCrawlersAsync()
{
    if (!blogEnv.BlogConfig.AllowSearchBots && !blogEnv.BlogConfig.AllowSearchDevBot)
    {
        return false;
    }

    var sitemapUrl = urlPrefix + "/sitemap.xml";

    // One client serves both requests, which run concurrently.
    using (var httpClient = new HttpClient())
    {
        var outcomes = await Task.WhenAll(
            NotifyCrawlerAsync(httpClient, "Google", $"http://www.google.com/ping?sitemap={sitemapUrl}"),
            NotifyCrawlerAsync(httpClient, "Bing", $"http://www.bing.com/ping?sitemap={sitemapUrl}"));

        return outcomes.All(ok => ok);
    }
}

/// <summary>
/// Sends a single ping request and logs the crawler's answer. Failures are
/// logged as warnings instead of being thrown, so one crawler cannot hide the
/// result of the other.
/// </summary>
/// <param name="httpClient">Client used to send the request.</param>
/// <param name="crawlerName">Name used in the log messages.</param>
/// <param name="pingUrl">Unescaped ping URL including the sitemap query value.</param>
/// <returns>true when a response was received, false when the request failed.</returns>
private async Task<bool> NotifyCrawlerAsync(HttpClient httpClient, string crawlerName, string pingUrl)
{
    /*WebUtility.UrlEncode*/
    // NOTE(review): EscapeUriString leaves reserved characters (':', '/', '&', ...)
    // unescaped, so the sitemap query value is sent as-is. Kept to preserve the request URL.
    var u = Uri.EscapeUriString(pingUrl);
    try
    {
        logger.LogInformation("Notifying crawlers: {Crawler} ({Url})", crawlerName, u);

        // Await the request and dispose the response message itself.
        using (var response = await httpClient.GetAsync(u))
        {
            string code = response.StatusCode.ToString();
            string apiResponse = await response.Content.ReadAsStringAsync();
            logger.LogInformation("{Crawler} responded: {Code} / {Response}", crawlerName, code, apiResponse);
        }
        return true;
    }
    catch (Exception ex)
    {
        logger.LogWarning(ex, "Notifying {Crawler} failed ({Url})", crawlerName, u);
        return false;
    }
}
|
|
|
|
|
|
/// <summary>
/// Reads the sitemap stored in the cache file.
/// </summary>
/// <returns>
/// The file content, or null when the file does not exist or cannot be read
/// (a read failure is logged as a warning).
/// </returns>
private string LoadSitemapCache()
{
    try
    {
        return System.IO.File.Exists(sitemapCacheFile)
            ? System.IO.File.ReadAllText(sitemapCacheFile)
            : null;
    }
    catch (Exception ex)
    {
        // The original message carried a stray '$' in front of the path; removed.
        logger.LogWarning(ex, "Error reading sitemapCacheFile {SitemapCacheFile}", sitemapCacheFile);
        return null;
    }
}
|
|
|
|
|
|
|
|
/// <summary>
/// Writes the given sitemap to the cache file, replacing any previous content.
/// </summary>
/// <param name="sitemap">Sitemap XML to persist.</param>
/// <returns>
/// true when the file was written, false when writing failed (the failure is
/// logged as a warning).
/// </returns>
private bool SaveSitemapCache(string sitemap)
{
    try
    {
        System.IO.File.WriteAllText(sitemapCacheFile, sitemap);
        return true;
    }
    catch (Exception ex)
    {
        // The original message carried a stray '$' in front of the path; removed.
        logger.LogWarning(ex, "Error writing sitemapCacheFile {SitemapCacheFile}", sitemapCacheFile);
        return false;
    }
}
|
|
|
|
|
|
|
|
/// <summary>
/// Event handler for the repository's BlogPostsChanged event: rebuilds the sitemap.
/// </summary>
/// <param name="sender">Not used.</param>
/// <param name="e">Not used.</param>
private void BlogPostRepo_BlogPostsChanged(object sender, EventArgs e) => UpdateSiteMap();
|
|
|
|
}
|
|
} |