// Required NuGet packages: Selenium.WebDriver, Newtonsoft.Json
internal class Program
{
private static readonly TimeSpan DefaultTimeout = TimeSpan.FromSeconds(30);
static void Main(string[] args)
{
Console.WriteLine("Please enter the path of the TSV file:");
string inputPath = Console.ReadLine();
if (!File.Exists(inputPath))
{
Console.WriteLine("The file does not exist!");
Console.ReadKey();
return;
}
string fileName = Path.GetFileNameWithoutExtension(inputPath);
string directory = Path.GetDirectoryName(inputPath);
string outputPath = $@"{directory}\{fileName}_output.tsv";
var lines = new List<string>(File.ReadLines(inputPath));
string baseLink = "https://bingdex.binginternal.com";
using (IWebDriver driver = new EdgeDriver())
{
driver.Manage().Window.Maximize();
WebDriverWait wait = new WebDriverWait(driver, DefaultTimeout);
using (TextWriter writer = new StreamWriter(outputPath))
{
//set header
writer.WriteLine(string.Join("\t", new string[] { "PageUrl", "Crawled_Status", "Crawled_Callisto_Status", "Crawled_Index_Status", "Served_Status", "Served_Reason", "Served_Warning", "Spam_Junk_List" }));
for (int i = 0; i < lines.Count; i++)
{
var pageUrl = lines[i];
try
{
driver.Navigate().GoToUrl($"{baseLink}/home?url={pageUrl}");
Func<IWebDriver, bool> stageStatusCondition = x =>
{
var elements = x.FindElements(By.ClassName("stageStatus"));
return elements.Count == 4 && elements.All(element => !string.IsNullOrEmpty(element.Text));
};
WaitElementLoad(driver, stageStatusCondition);
var stageStatusList = driver.FindElements(By.ClassName("stageStatus"));
string crawled_status_text = stageStatusList[2].Text;
string crawled_status = ReplaceBlank(crawled_status_text);
string crawled_callisto_status = string.Empty;
string crawled_index_status = string.Empty;
string served_reason = string.Empty;
string served_warning = string.Empty;
string served_satus = ReplaceBlank(stageStatusList[3].Text);
if (IsElementPresent(driver, By.XPath("//th[contains(text(),'Callisto status')]")))
{
var callistostatusElement = driver.FindElement(By.XPath("//th[contains(text(),'Callisto status')]"));
string crawled_callisto_status_text = callistostatusElement.FindElement(By.XPath("following-sibling::td")).Text;
crawled_callisto_status = ReplaceBlank(crawled_callisto_status_text);
}
if (IsElementPresent(driver, By.XPath("//th[contains(text(),'Index status')]")))
{
var indexStatusElement = driver.FindElement(By.XPath("//th[contains(text(),'Index status')]"));
var crawled_index_status_text = indexStatusElement.FindElement(By.XPath("following-sibling::td")).Text;
crawled_index_status = ReplaceBlank(crawled_index_status_text);
}
if (IsElementPresent(driver, By.XPath("//th[contains(text(),'Reason')]")))
{
var reasonElement = driver.FindElement(By.XPath("//th[contains(text(),'Reason')]"));
var served_reason_text = reasonElement.FindElement(By.XPath("following-sibling::td")).Text;
served_reason = ReplaceBlank(served_reason_text);
}
if (IsElementPresent(driver, By.XPath("//th[contains(text(),'Warning')]")))
{
var warningElement = driver.FindElement(By.XPath("//th[contains(text(),'Warning')]"));
var served_warning_text = warningElement.FindElement(By.XPath("following-sibling::td")).Text;
served_warning = ReplaceBlank(served_warning_text);
}
var spamJunklistBtn = driver.FindElement(By.XPath("//*[@id=\"root\"]/div/div/div[1]/div/div[2]/button"));
spamJunklistBtn.Click();
Thread.Sleep(2000);
var spamJunkList = GetSpamJunkList(driver);
writer.WriteLine(string.Join("\t", new string[] { pageUrl, crawled_status, crawled_callisto_status, crawled_index_status, served_satus, served_reason, served_warning, spamJunkList }));
}
catch (Exception ex)
{
using (var errorWriter = new StreamWriter($@"{directory}\{fileName}_error.txt", true))
{
errorWriter.WriteLine(pageUrl);
}
}
}
}
Console.WriteLine("output file success!");
Console.ReadKey();
}
}
public static string ReplaceBlank(string str)
{
if (string.IsNullOrEmpty(str))
{
return str;
}
else
{
return str.Contains("\r") ? str.Substring(0, str.IndexOf("\r")) : str;
}
}
public static bool IsElementPresent(IWebDriver driver, By by)
{
try
{
driver.FindElement(by);
return true;
}
catch (NoSuchElementException)
{
return false;
}
}
//Wait for element loading based on custom conditions
public static void WaitElementLoad(IWebDriver driver, Func<IWebDriver, bool> condition)
{
try
{
WebDriverWait wait = new WebDriverWait(driver, DefaultTimeout);
wait.Until(condition);
}
catch (WebDriverTimeoutException ex)
{
Console.WriteLine("Timeout waiting for elements to load inner text: " + ex.Message);
throw ex;
}
}
public static string GetSpamJunkList(IWebDriver driver)
{
string spamJunkList = string.Empty;
List<SpamJunk> spamJunks = new List<SpamJunk>();
try
{
var rows = driver.FindElements(By.ClassName("ms-List-cell"));
if (rows.Count > 0)
{
foreach (var item in rows)
{
if (IsElementPresent(driver, By.ClassName("ms-DetailsRow-cell")))
{
var cells = item.FindElements(By.ClassName("ms-DetailsRow-cell"));
if (cells.Count == 5)
{
spamJunks.Add(new SpamJunk()
{
Group = cells?[0]?.Text,
Lable = cells?[1]?.Text,
Signature = cells?[2]?.Text,
Source = cells?[3]?.Text,
Tier = cells?[4]?.Text
});
}
continue;
}
}
spamJunkList = JsonConvert.SerializeObject(spamJunks);
}
else
{
spamJunkList = "No spam data found";
}
}
catch (NoSuchElementException)
{
spamJunkList = "No spam data found";
}
return spamJunkList;
}
}
/// <summary>
/// One row of the spam/junk list. GetSpamJunkList fills it from the text of a row's five
/// cells (in cell order) and serializes the collected rows with JsonConvert.SerializeObject.
/// </summary>
class SpamJunk
{
/// <summary>Text of the row's first cell (index 0).</summary>
public string Group { get; set; }
/// <summary>
/// Text of the row's second cell (index 1).
/// NOTE(review): presumably a misspelling of "Label"; the property name is likely what
/// appears as the key in the serialized JSON, so renaming it would change the output.
/// </summary>
public string Lable { get; set; }
/// <summary>Text of the row's third cell (index 2).</summary>
public string Signature { get; set; }
/// <summary>Text of the row's fourth cell (index 3).</summary>
public string Source { get; set; }
/// <summary>Text of the row's fifth cell (index 4).</summary>
public string Tier { get; set; }
}
版权归原作者 xiaolanzhu_ 所有, 如有侵权,请联系我们删除。