// RssCollectorService.cs
using System.Globalization;
using System.Net;
using System.Text.Json;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using Application.Abstractions.Data;
using Application.Abstractions.News;
using Domain.Entities.News;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
  11. namespace Infrastructure.News;
  12. public sealed class RssCollectorService(
  13. IServiceScopeFactory scopeFactory,
  14. IHttpClientFactory httpClientFactory,
  15. ILogger<RssCollectorService> logger
  16. ) : BackgroundService, IRssCollector
  17. {
  18. private static readonly XNamespace _dc = "http://purl.org/dc/elements/1.1/";
  19. private static readonly XNamespace _content = "http://purl.org/rss/1.0/modules/content/";
  20. private static readonly XNamespace _media = "http://search.yahoo.com/mrss/";
  21. private static readonly XNamespace _slash = "http://purl.org/rss/1.0/modules/slash/";
  22. protected override async Task ExecuteAsync(CancellationToken ct)
  23. {
  24. // 서비스 시작 직후 잠시 대기 (DB 준비)
  25. await Task.Delay(5000, ct);
  26. while (!ct.IsCancellationRequested)
  27. {
  28. try
  29. {
  30. await CollectAllFeedsAsync(ct);
  31. }
  32. catch (OperationCanceledException) when (ct.IsCancellationRequested)
  33. {
  34. break;
  35. }
  36. catch (Exception ex)
  37. {
  38. logger.LogError(ex, "RSS 수집 루프에서 예외 발생");
  39. }
  40. await Task.Delay(TimeSpan.FromMinutes(1), ct);
  41. }
  42. }
  43. public async Task<int> FetchSourceAsync(int sourceID, CancellationToken ct)
  44. {
  45. using var scope = scopeFactory.CreateScope();
  46. var db = scope.ServiceProvider.GetRequiredService<IAppDbContext>();
  47. var source = await db.RssFeedSource.FindAsync([sourceID], ct) ?? throw new KeyNotFoundException($"소스 ID {sourceID}를 찾을 수 없습니다.");
  48. var count = await FetchAndStoreAsync(source, db, ct);
  49. source.MarkFetched();
  50. await db.SaveChangesAsync(ct);
  51. return count;
  52. }
  53. private async Task CollectAllFeedsAsync(CancellationToken ct)
  54. {
  55. using var scope = scopeFactory.CreateScope();
  56. var db = scope.ServiceProvider.GetRequiredService<IAppDbContext>();
  57. // 초기 시드: 테이블이 비어있으면 기본 소스 삽입
  58. if (!await db.RssFeedSource.AnyAsync(ct))
  59. {
  60. await SeedDefaultSourcesAsync(db, ct);
  61. }
  62. var sources = await db.RssFeedSource.Where(x => x.IsActive).ToListAsync(ct);
  63. var now = DateTime.UtcNow;
  64. foreach (var source in sources)
  65. {
  66. // 수집 주기 체크
  67. if (source.LastFetchedAt.HasValue && (now - source.LastFetchedAt.Value).TotalMinutes < source.IntervalMinutes)
  68. {
  69. continue;
  70. }
  71. try
  72. {
  73. var count = await FetchAndStoreAsync(source, db, ct);
  74. source.MarkFetched();
  75. await db.SaveChangesAsync(ct);
  76. if (count > 0)
  77. {
  78. logger.LogInformation("[RSS] {SourceName}: {Count}건 수집 완료", source.Name, count);
  79. }
  80. }
  81. catch (Exception ex)
  82. {
  83. logger.LogWarning(ex, "[RSS] {SourceName} ({Url}) 수집 실패", source.Name, source.Url);
  84. }
  85. }
  86. }
  87. private async Task<int> FetchAndStoreAsync(RssFeedSource source, IAppDbContext db, CancellationToken ct)
  88. {
  89. var client = httpClientFactory.CreateClient("RssFeed");
  90. var xml = await client.GetStringAsync(source.Url, ct);
  91. var doc = XDocument.Parse(xml);
  92. var items = doc.Descendants("item").ToList();
  93. if (items.Count == 0)
  94. {
  95. return 0;
  96. }
  97. // 파싱
  98. var articles = items.Select(item => ParseItem(source.ID, item)).ToList();
  99. // 중복 체크: 기존 Guid 조회
  100. var guids = articles.Where(a => a.Guid != null).Select(a => a.Guid!).ToList();
  101. var existingGuids = await db.RssNewsArticle
  102. .Where(x => x.RssFeedSourceID == source.ID && x.Guid != null && guids.Contains(x.Guid!))
  103. .Select(x => x.Guid!)
  104. .ToListAsync(ct);
  105. var existingSet = existingGuids.ToHashSet();
  106. var newArticles = articles.Where(a => a.Guid == null || !existingSet.Contains(a.Guid)).ToList();
  107. if (newArticles.Count == 0)
  108. {
  109. return 0;
  110. }
  111. db.RssNewsArticle.AddRange(newArticles);
  112. await db.SaveChangesAsync(ct);
  113. return newArticles.Count;
  114. }
  115. private static RssNewsArticle ParseItem(int sourceID, XElement item)
  116. {
  117. var title = item.Element("title")?.Value?.Trim() ?? "";
  118. var link = item.Element("link")?.Value?.Trim();
  119. var guid = item.Element("guid")?.Value?.Trim() ?? link;
  120. var author = item.Element(_dc + "creator")?.Value?.Trim();
  121. var description = item.Element("description")?.Value?.Trim();
  122. var content = item.Element(_content + "encoded")?.Value?.Trim();
  123. var sourceName = item.Element("source")?.Value?.Trim();
  124. // Author fallback: Google News의 source 요소
  125. if (string.IsNullOrEmpty(author) && !string.IsNullOrEmpty(sourceName))
  126. {
  127. author = sourceName;
  128. }
  129. // 이미지: media:content > enclosure > description/content 내 첫 번째 <img>
  130. var imageUrl = item.Element(_media + "content")?.Attribute("url")?.Value?.Trim()
  131. ?? item.Element("enclosure")?.Attribute("url")?.Value?.Trim();
  132. if (string.IsNullOrEmpty(imageUrl))
  133. {
  134. imageUrl = ExtractFirstImageUrl(description) ?? ExtractFirstImageUrl(content);
  135. }
  136. // 카테고리: JSON 배열로 저장
  137. var categories = item.Elements("category").Select(c => c.Value.Trim()).Where(c => !string.IsNullOrEmpty(c)).ToList();
  138. string? categoriesJson = categories.Count > 0 ? JsonSerializer.Serialize(categories) : null;
  139. // 댓글 수
  140. var commentCountStr = item.Element(_slash + "comments")?.Value;
  141. int.TryParse(commentCountStr, out var commentCount);
  142. // 발행일
  143. DateTime? publishedAt = null;
  144. var pubDateStr = item.Element("pubDate")?.Value;
  145. if (!string.IsNullOrEmpty(pubDateStr) && DateTimeOffset.TryParse(pubDateStr, out var dto))
  146. {
  147. publishedAt = dto.UtcDateTime;
  148. }
  149. return RssNewsArticle.Create(
  150. rssFeedSourceID: sourceID,
  151. title: title,
  152. link: link,
  153. guid: guid,
  154. author: author,
  155. description: description,
  156. content: content,
  157. imageUrl: imageUrl,
  158. sourceName: sourceName,
  159. categories: categoriesJson,
  160. commentCount: commentCount,
  161. publishedAt: publishedAt
  162. );
  163. }
  164. private static readonly Regex _imgSrcRegex = new(@"<img[^>]+src\s*=\s*[""']([^""']+)[""']", RegexOptions.IgnoreCase | RegexOptions.Compiled);
  165. private static string? ExtractFirstImageUrl(string? html)
  166. {
  167. if (string.IsNullOrEmpty(html))
  168. {
  169. return null;
  170. }
  171. var match = _imgSrcRegex.Match(html);
  172. return match.Success ? match.Groups[1].Value.Trim() : null;
  173. }
  174. private static async Task SeedDefaultSourcesAsync(IAppDbContext db, CancellationToken ct)
  175. {
  176. db.RssFeedSource.AddRange(
  177. RssFeedSource.Create("CoinSpeaker KR", "https://www.coinspeaker.com/kr/news/feed/", "CoinSpeaker 한국어 뉴스", 10),
  178. RssFeedSource.Create("Google News 비트코인", "https://news.google.com/rss/search?q=%EB%B9%84%ED%8A%B8%EC%BD%94%EC%9D%B8&hl=ko&gl=KR&ceid=KR:ko", "구글 뉴스 비트코인 검색", 15),
  179. RssFeedSource.Create("CoinTelegraph KR", "https://cointelegraph-kr.com/rss", "CoinTelegraph 한국어 뉴스", 10)
  180. );
  181. await db.SaveChangesAsync(ct);
  182. }
  183. }